diff options
Diffstat (limited to 'kernel')
99 files changed, 11858 insertions, 3513 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 4242366..90b53f6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -93,8 +93,10 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ +obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/async.c b/kernel/async.c index 968ef94..27235f5 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -92,19 +92,18 @@ extern int initcall_debug; static async_cookie_t __lowest_in_progress(struct list_head *running) { struct async_entry *entry; + if (!list_empty(running)) { entry = list_first_entry(running, struct async_entry, list); return entry->cookie; - } else if (!list_empty(&async_pending)) { - entry = list_first_entry(&async_pending, - struct async_entry, list); - return entry->cookie; - } else { - /* nothing in progress... next_cookie is "infinity" */ - return next_cookie; } + list_for_each_entry(entry, &async_pending, list) + if (entry->running == running) + return entry->cookie; + + return next_cookie; /* "infinity" value */ } static async_cookie_t lowest_in_progress(struct list_head *running) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 6e73517..1f6396d7 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -568,7 +568,7 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(path.mnt, path.dentry); + root_mnt = collect_mounts(&path); path_put(&path); if (!root_mnt) goto skip_it; @@ -660,7 +660,7 @@ int audit_add_tree_rule(struct audit_krule *rule) err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(path.mnt, path.dentry); + mnt = collect_mounts(&path); path_put(&path); if (!mnt) { err = -ENOMEM; @@ -720,7 +720,7 @@ int audit_tag_tree(char *old, char *new) err = kern_path(new, 0, &path); if (err) return err; - tagged = collect_mounts(path.mnt, path.dentry); + tagged = collect_mounts(&path); path_put(&path); if (!tagged) return -ENOMEM; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a7267bf..3fb789f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -46,6 +46,7 @@ #include <linux/cgroupstats.h> #include <linux/hash.h> #include <linux/namei.h> +#include <linux/smp_lock.h> #include <asm/atomic.h> @@ -900,6 +901,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroup *cgrp = &root->top_cgroup; struct cgroup_sb_opts opts; + lock_kernel(); mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); @@ -927,6 +929,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) kfree(opts.release_agent); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + unlock_kernel(); return ret; } diff --git a/kernel/compat.c b/kernel/compat.c index 42d5654..f6c204f0 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, } +asmlinkage long +compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, + struct compat_siginfo __user *uinfo) +{ + siginfo_t info; + + if (copy_siginfo_from_user32(&info, uinfo)) + return -EFAULT; + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} + #ifdef __ARCH_WANT_COMPAT_SYS_TIME /* compat_time_t is a 32 bit "long" and needs to get converted. */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 026facc..d5a7e17 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1857,7 +1857,7 @@ struct cgroup_subsys cpuset_subsys = { int __init cpuset_init_early(void) { - alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed); + alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT); top_cpuset.mems_generation = cpuset_mems_generation++; return 0; diff --git a/kernel/cred.c b/kernel/cred.c index 3a03918..1bb4d7e 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() - * - The caller must hold current->cred_exec_mutex + * - The caller must hold current->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { @@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; - mutex_init(&p->cred_exec_mutex); + mutex_init(&p->cred_guard_mutex); if ( #ifdef CONFIG_KEYS diff --git a/kernel/exit.c b/kernel/exit.c index abf9cf3..b6c90b5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,7 +48,8 @@ #include <linux/tracehook.h> #include <linux/fs_struct.h> #include <linux/init_task.h> -#include <trace/sched.h> +#include <linux/perf_counter.h> +#include <trace/events/sched.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -56,10 +57,6 @@ #include <asm/mmu_context.h> #include "cred-internals.h" -DEFINE_TRACE(sched_process_free); -DEFINE_TRACE(sched_process_exit); -DEFINE_TRACE(sched_process_wait); - static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) @@ -158,6 +155,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); +#ifdef CONFIG_PERF_COUNTERS + WARN_ON_ONCE(tsk->perf_counter_ctxp); +#endif trace_sched_process_free(tsk); put_task_struct(tsk); } @@ -174,6 +174,7 @@ repeat: atomic_dec(&__task_cred(p)->user->processes); proc_flush_task(p); + write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); __exit_signal(p); @@ -975,16 +976,19 @@ NORET_TYPE void do_exit(long code) module_put(tsk->binfmt->module); proc_exit_connector(tsk); + + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif #ifdef CONFIG_FUTEX - /* - * This must happen late, after the PID is not - * hashed anymore: - */ if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); if (unlikely(current->pi_state_cache)) @@ -1476,6 +1480,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace, */ if (*notask_error) *notask_error = ret; + return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd..4430eb1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,8 +61,8 @@ #include <linux/proc_fs.h> #include <linux/blkdev.h> #include <linux/fs_struct.h> -#include <trace/sched.h> #include <linux/magic.h> +#include <linux/perf_counter.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -71,6 +71,8 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include <trace/events/sched.h> + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -83,8 +85,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ -DEFINE_TRACE(sched_process_fork); - int nr_processes(void) { int cpu; @@ -982,6 +982,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!p) goto fork_out; + ftrace_graph_init_task(p); + rt_mutex_init_task(p); #ifdef CONFIG_PROVE_LOCKING @@ -1089,12 +1091,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - if (unlikely(current->ptrace)) - ptrace_fork(p, clone_flags); + + p->bts = NULL; /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); + retval = perf_counter_init_task(p); + if (retval) + goto bad_fork_cleanup_policy; + if ((retval = audit_alloc(p))) goto bad_fork_cleanup_policy; /* copy all the process information */ @@ -1131,8 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } - ftrace_graph_init_task(p); - p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) @@ -1141,7 +1145,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (current->nsproxy != p->nsproxy) { retval = ns_cgroup_clone(p, pid); if (retval) - goto bad_fork_free_graph; + goto bad_fork_free_pid; } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; @@ -1233,7 +1237,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_free_graph; + goto bad_fork_free_pid; } if (clone_flags & CLONE_THREAD) { @@ -1268,8 +1272,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, cgroup_post_fork(p); return p; -bad_fork_free_graph: - ftrace_graph_exit_task(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); @@ -1293,6 +1295,7 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: + perf_counter_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: @@ -1406,10 +1409,16 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); + } else if (!(clone_flags & CLONE_VM)) { + /* + * vfork will do an exec which will call + * set_task_comm() + */ + perf_counter_fork(p); } audit_finish_fork(p); - tracehook_report_clone(trace, regs, clone_flags, nr, p); + tracehook_report_clone(regs, clone_flags, nr, p); /* * We set PF_STARTING at creation in case tracing wants to diff --git a/kernel/futex.c b/kernel/futex.c index eef8cd2..80b5ce7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -19,6 +19,10 @@ * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> * + * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> + * Copyright (C) IBM Corporation, 2009 + * Thanks to Thomas Gleixner for conceptual design and careful reviews. + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -96,8 +100,8 @@ struct futex_pi_state { */ struct futex_q { struct plist_node list; - /* There can only be a single waiter */ - wait_queue_head_t waiter; + /* Waiter reference */ + struct task_struct *task; /* Which hash list lock to use: */ spinlock_t *lock_ptr; @@ -107,7 +111,9 @@ struct futex_q { /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; - struct task_struct *task; + + /* rt_waiter storage for requeue_pi: */ + struct rt_mutex_waiter *rt_waiter; /* Bitset for the optional bitmasked wakeup */ u32 bitset; @@ -193,6 +199,7 @@ static void drop_futex_key_refs(union futex_key *key) * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) * * Returns a negative error code or 0 * The key words are stored in *key on success. @@ -203,7 +210,8 @@ static void drop_futex_key_refs(union futex_key *key) * * lock_page() might sleep, the caller should not hold a spinlock. */ -static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) +static int +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -226,7 +234,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) * but access_ok() should be faster than find_vma() */ if (!fshared) { - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) + if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) return -EFAULT; key->private.mm = mm; key->private.address = address; @@ -235,7 +243,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) } again: - err = get_user_pages_fast(address, 1, 0, &page); + err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); if (err < 0) return err; @@ -276,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key) drop_futex_key_refs(key); } +/** + * futex_top_waiter() - Return the highest priority waiter on a futex + * @hb: the hash bucket the futex_q's reside in + * @key: the futex key (to distinguish it from other futex futex_q's) + * + * Must be called with the hb lock held. + */ +static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, + union futex_key *key) +{ + struct futex_q *this; + + plist_for_each_entry(this, &hb->chain, list) { + if (match_futex(&this->key, key)) + return this; + } + return NULL; +} + static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) { u32 curval; @@ -537,28 +564,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return 0; } +/** + * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex + * @uaddr: the pi futex user address + * @hb: the pi futex hash bucket + * @key: the futex key associated with uaddr and hb + * @ps: the pi_state pointer where we store the result of the + * lookup + * @task: the task to perform the atomic lock work for. This will + * be "current" except in the case of requeue pi. + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Returns: + * 0 - ready to wait + * 1 - acquired the lock + * <0 - error + * + * The hb->lock and futex_key refs shall be held by the caller. + */ +static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **ps, + struct task_struct *task, int set_waiters) +{ + int lock_taken, ret, ownerdied = 0; + u32 uval, newval, curval; + +retry: + ret = lock_taken = 0; + + /* + * To avoid races, we attempt to take the lock here again + * (by doing a 0 -> TID atomic cmpxchg), while holding all + * the locks. It will most likely not succeed. + */ + newval = task_pid_vnr(task); + if (set_waiters) + newval |= FUTEX_WAITERS; + + curval = cmpxchg_futex_value_locked(uaddr, 0, newval); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + + /* + * Detect deadlocks. + */ + if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) + return -EDEADLK; + + /* + * Surprise - we got the lock. Just return to userspace: + */ + if (unlikely(!curval)) + return 1; + + uval = curval; + + /* + * Set the FUTEX_WAITERS flag, so the owner will know it has someone + * to wake at the next unlock. + */ + newval = curval | FUTEX_WAITERS; + + /* + * There are two cases, where a futex might have no owner (the + * owner TID is 0): OWNER_DIED. We take over the futex in this + * case. We also do an unconditional take over, when the owner + * of the futex died. + * + * This is safe as we are protected by the hash bucket lock ! + */ + if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { + /* Keep the OWNER_DIED bit */ + newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); + ownerdied = 0; + lock_taken = 1; + } + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + if (unlikely(curval != uval)) + goto retry; + + /* + * We took the lock due to owner died take over. + */ + if (unlikely(lock_taken)) + return 1; + + /* + * We dont have the lock. Look up the PI state (or create it if + * we are the first waiter): + */ + ret = lookup_pi_state(uval, hb, key, ps); + + if (unlikely(ret)) { + switch (ret) { + case -ESRCH: + /* + * No owner found for this futex. Check if the + * OWNER_DIED bit is set to figure out whether + * this is a robust futex or not. + */ + if (get_futex_value_locked(&curval, uaddr)) + return -EFAULT; + + /* + * We simply start over in case of a robust + * futex. The code above will take the futex + * and return happy. + */ + if (curval & FUTEX_OWNER_DIED) { + ownerdied = 1; + goto retry; + } + default: + break; + } + } + + return ret; +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. */ static void wake_futex(struct futex_q *q) { - plist_del(&q->list, &q->list.plist); + struct task_struct *p = q->task; + /* - * The lock in wake_up_all() is a crucial memory barrier after the - * plist_del() and also before assigning to q->lock_ptr. + * We set q->lock_ptr = NULL _before_ we wake up the task. If + * a non futex wake up happens on another CPU then the task + * might exit and p would dereference a non existing task + * struct. Prevent this by holding a reference on p across the + * wake up. */ - wake_up(&q->waiter); + get_task_struct(p); + + plist_del(&q->list, &q->list.plist); /* - * The waiting task can free the futex_q as soon as this is written, - * without taking any locks. This must come last. - * - * A memory barrier is required here to prevent the following store to - * lock_ptr from getting ahead of the wakeup. Clearing the lock at the - * end of wake_up() does not prevent this store from moving. + * The waiting task can free the futex_q as soon as + * q->lock_ptr = NULL is written, without taking any locks. A + * memory barrier is required here to prevent the following + * store to lock_ptr from getting ahead of the plist_del. */ smp_wmb(); q->lock_ptr = NULL; + + wake_up_state(p, TASK_NORMAL); + put_task_struct(p); } static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) @@ -677,7 +836,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -687,7 +846,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { - if (this->pi_state) { + if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } @@ -723,10 +882,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) goto out_put_key1; @@ -800,24 +959,185 @@ out: return ret; } -/* - * Requeue all waiters hashed on one physical page to another - * physical page. +/** + * requeue_futex() - Requeue a futex_q from one hb to another + * @q: the futex_q to requeue + * @hb1: the source hash_bucket + * @hb2: the target hash_bucket + * @key2: the new key for the requeued futex_q + */ +static inline +void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key2) +{ + + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. + */ + if (likely(&hb1->chain != &hb2->chain)) { + plist_del(&q->list, &hb1->chain); + plist_add(&q->list, &hb2->chain); + q->lock_ptr = &hb2->lock; +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb2->lock; +#endif + } + get_futex_key_refs(key2); + q->key = *key2; +} + +/** + * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue + * q: the futex_q + * key: the key of the requeue target futex + * + * During futex_requeue, with requeue_pi=1, it is possible to acquire the + * target futex if it is uncontended or via a lock steal. Set the futex_q key + * to the requeue target futex so the waiter can detect the wakeup on the right + * futex, but remove it from the hb and NULL the rt_waiter so it can detect + * atomic lock acquisition. Must be called with the q->lock_ptr held. + */ +static inline +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +{ + drop_futex_key_refs(&q->key); + get_futex_key_refs(key); + q->key = *key; + + WARN_ON(plist_node_empty(&q->list)); + plist_del(&q->list, &q->list.plist); + + WARN_ON(!q->rt_waiter); + q->rt_waiter = NULL; + + wake_up_state(q->task, TASK_NORMAL); +} + +/** + * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * @ps: address to store the pi_state pointer + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Try and get the lock on behalf of the top waiter if we can do it atomically. + * Wake the top waiter if we succeed. If the caller specified set_waiters, + * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. + * hb1 and hb2 must be held by the caller. + * + * Returns: + * 0 - failed to acquire the lock atomicly + * 1 - acquired the lock + * <0 - error + */ +static int futex_proxy_trylock_atomic(u32 __user *pifutex, + struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, + union futex_key *key1, union futex_key *key2, + struct futex_pi_state **ps, int set_waiters) +{ + struct futex_q *top_waiter = NULL; + u32 curval; + int ret; + + if (get_futex_value_locked(&curval, pifutex)) + return -EFAULT; + + /* + * Find the top_waiter and determine if there are additional waiters. + * If the caller intends to requeue more than 1 waiter to pifutex, + * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, + * as we have means to handle the possible fault. If not, don't set + * the bit unecessarily as it will force the subsequent unlock to enter + * the kernel. + */ + top_waiter = futex_top_waiter(hb1, key1); + + /* There are no waiters, nothing for us to do. */ + if (!top_waiter) + return 0; + + /* + * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in + * the contended case or if set_waiters is 1. The pi_state is returned + * in ps in contended cases. + */ + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, + set_waiters); + if (ret == 1) + requeue_pi_wake_futex(top_waiter, key2); + + return ret; +} + +/** + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 + * uaddr1: source futex user address + * uaddr2: target futex user address + * nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * nr_requeue: number of waiters to requeue (0-INT_MAX) + * requeue_pi: if we are attempting to requeue from a non-pi futex to a + * pi futex (pi to pi requeue is not supported) + * + * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire + * uaddr2 atomically on behalf of the top waiter. + * + * Returns: + * >=0 - on success, the number of tasks requeued or woken + * <0 - on error */ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, - int nr_wake, int nr_requeue, u32 *cmpval) + int nr_wake, int nr_requeue, u32 *cmpval, + int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + int drop_count = 0, task_count = 0, ret; + struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; - int ret, drop_count = 0; + u32 curval2; + + if (requeue_pi) { + /* + * requeue_pi requires a pi_state, try to allocate it now + * without any locks in case it fails. + */ + if (refill_pi_state_cache()) + return -ENOMEM; + /* + * requeue_pi must wake as many tasks as it can, up to nr_wake + * + nr_requeue, since it acquires the rt_mutex prior to + * returning to userspace, so as to not leave the rt_mutex with + * waiters and no owner. However, second and third wake-ups + * cannot be predicted as they involve race conditions with the + * first wake and a fault while looking up the pi_state. Both + * pthread_cond_signal() and pthread_cond_broadcast() should + * use nr_wake=1. + */ + if (nr_wake != 1) + return -EINVAL; + } retry: - ret = get_futex_key(uaddr1, fshared, &key1); + if (pi_state != NULL) { + /* + * We will have to lookup the pi_state again, so free this one + * to keep the accounting correct. + */ + free_pi_state(pi_state); + pi_state = NULL; + } + + ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, fshared, &key2, + requeue_pi ? VERIFY_WRITE : VERIFY_READ); if (unlikely(ret != 0)) goto out_put_key1; @@ -852,32 +1172,99 @@ retry_private: } } + if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + /* + * Attempt to acquire uaddr2 and wake the top waiter. If we + * intend to requeue waiters, force setting the FUTEX_WAITERS + * bit. We force this here where we are able to easily handle + * faults rather in the requeue loop below. + */ + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state, nr_requeue); + + /* + * At this point the top_waiter has either taken uaddr2 or is + * waiting on it. If the former, then the pi_state will not + * exist yet, look it up one more time to ensure we have a + * reference to it. + */ + if (ret == 1) { + WARN_ON(pi_state); + task_count++; + ret = get_futex_value_locked(&curval2, uaddr2); + if (!ret) + ret = lookup_pi_state(curval2, hb2, &key2, + &pi_state); + } + + switch (ret) { + case 0: + break; + case -EFAULT: + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + ret = get_user(curval2, uaddr2); + if (!ret) + goto retry; + goto out; + case -EAGAIN: + /* The owner was exiting, try again. */ + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + cond_resched(); + goto retry; + default: + goto out_unlock; + } + } + head1 = &hb1->chain; plist_for_each_entry_safe(this, next, head1, list) { - if (!match_futex (&this->key, &key1)) + if (task_count - nr_wake >= nr_requeue) + break; + + if (!match_futex(&this->key, &key1)) continue; - if (++ret <= nr_wake) { + + WARN_ON(!requeue_pi && this->rt_waiter); + WARN_ON(requeue_pi && !this->rt_waiter); + + /* + * Wake nr_wake waiters. For requeue_pi, if we acquired the + * lock, we already woke the top_waiter. If not, it will be + * woken by futex_unlock_pi(). + */ + if (++task_count <= nr_wake && !requeue_pi) { wake_futex(this); - } else { - /* - * If key1 and key2 hash to the same bucket, no need to - * requeue. - */ - if (likely(head1 != &hb2->chain)) { - plist_del(&this->list, &hb1->chain); - plist_add(&this->list, &hb2->chain); - this->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - this->list.plist.lock = &hb2->lock; -#endif - } - this->key = key2; - get_futex_key_refs(&key2); - drop_count++; + continue; + } - if (ret - nr_wake >= nr_requeue) - break; + /* + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + */ + if (requeue_pi) { + /* Prepare the waiter to take the rt_mutex. */ + atomic_inc(&pi_state->refcount); + this->pi_state = pi_state; + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task, 1); + if (ret == 1) { + /* We got the lock. */ + requeue_pi_wake_futex(this, &key2); + continue; + } else if (ret) { + /* -EDEADLK */ + this->pi_state = NULL; + free_pi_state(pi_state); + goto out_unlock; + } } + requeue_futex(this, hb1, hb2, &key2); + drop_count++; } out_unlock: @@ -897,7 +1284,9 @@ out_put_keys: out_put_key1: put_futex_key(fshared, &key1); out: - return ret; + if (pi_state != NULL) + free_pi_state(pi_state); + return ret ? ret : task_count; } /* The key must be already stored in q->key. */ @@ -905,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - init_waitqueue_head(&q->waiter); - get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -1117,35 +1504,149 @@ handle_fault: */ #define FLAGS_SHARED 0x01 #define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 static long futex_wait_restart(struct restart_block *restart); -static int futex_wait(u32 __user *uaddr, int fshared, - u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +/** + * fixup_owner() - Post lock pi_state and corner case management + * @uaddr: user address of the futex + * @fshared: whether the futex is shared (1) or not (0) + * @q: futex_q (contains pi_state and access to the rt_mutex) + * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) + * + * After attempting to lock an rt_mutex, this function is called to cleanup + * the pi_state owner as well as handle race conditions that may allow us to + * acquire the lock. Must be called with the hb lock held. + * + * Returns: + * 1 - success, lock taken + * 0 - success, lock not taken + * <0 - on error (-EFAULT) + */ +static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, + int locked) { - struct task_struct *curr = current; - struct restart_block *restart; - DECLARE_WAITQUEUE(wait, curr); - struct futex_hash_bucket *hb; - struct futex_q q; - u32 uval; - int ret; - struct hrtimer_sleeper t; - int rem = 0; + struct task_struct *owner; + int ret = 0; - if (!bitset) - return -EINVAL; + if (locked) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case: + */ + if (q->pi_state->owner != current) + ret = fixup_pi_state_owner(uaddr, q, current, fshared); + goto out; + } - q.pi_state = NULL; - q.bitset = bitset; -retry: - q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); - if (unlikely(ret != 0)) + /* + * Catch the rare case, where the lock was released when we were on the + * way back before we locked the hash bucket. + */ + if (q->pi_state->owner == current) { + /* + * Try to get the rt_mutex now. This might fail as some other + * task acquired the rt_mutex after we removed ourself from the + * rt_mutex waiters list. + */ + if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { + locked = 1; + goto out; + } + + /* + * pi_state is incorrect, some other task did a lock steal and + * we returned due to timeout or signal without taking the + * rt_mutex. Too late. We can access the rt_mutex_owner without + * locking, as the other task is now blocked on the hash bucket + * lock. Fix the state up. + */ + owner = rt_mutex_owner(&q->pi_state->pi_mutex); + ret = fixup_pi_state_owner(uaddr, q, owner, fshared); goto out; + } -retry_private: - hb = queue_lock(&q); + /* + * Paranoia check. If we did not take the lock, then we should not be + * the owner, nor the pending owner, of the rt_mutex. + */ + if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) + printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " + "pi-state %p\n", ret, + q->pi_state->pi_mutex.owner, + q->pi_state->owner); + +out: + return ret ? ret : locked; +} + +/** + * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal + * @hb: the futex hash bucket, must be locked by the caller + * @q: the futex_q to queue up on + * @timeout: the prepared hrtimer_sleeper, or null for no timeout + */ +static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + struct hrtimer_sleeper *timeout) +{ + queue_me(q, hb); + + /* + * There might have been scheduling since the queue_me(), as we + * cannot hold a spinlock across the get_user() in case it + * faults, and we cannot just set TASK_INTERRUPTIBLE state when + * queueing ourselves into the futex hash. This code thus has to + * rely on the futex_wake() code removing us from hash when it + * wakes us up. + */ + set_current_state(TASK_INTERRUPTIBLE); + + /* Arm the timer */ + if (timeout) { + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } + + /* + * !plist_node_empty() is safe here without any lock. + * q.lock_ptr != 0 is not safe, because of ordering against wakeup. + */ + if (likely(!plist_node_empty(&q->list))) { + /* + * If the timer has already expired, current will already be + * flagged for rescheduling. Only call schedule if there + * is no timeout, or if it has yet to expire. + */ + if (!timeout || timeout->task) + schedule(); + } + __set_current_state(TASK_RUNNING); +} + +/** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address + * @val: the expected value + * @fshared: whether the futex is shared (1) or not (0) + * @q: the associated futex_q + * @hb: storage for hash_bucket pointer to be returned to caller + * + * Setup the futex_q and locate the hash_bucket. Get the futex value and + * compare it with the expected value. Handle atomic faults internally. + * Return with the hb lock held and a q.key reference on success, and unlocked + * with no q.key reference on failure. + * + * Returns: + * 0 - uaddr contains val and hb has been locked + * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked + */ +static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, + struct futex_q *q, struct futex_hash_bucket **hb) +{ + u32 uval; + int ret; /* * Access the page AFTER the hash-bucket is locked. @@ -1163,95 +1664,83 @@ retry_private: * A consequence is that futex_wait() can return zero and absorb * a wakeup when *uaddr != val on entry to the syscall. This is * rare, but normal. - * - * For shared futexes, we hold the mmap semaphore, so the mapping - * cannot have changed since we looked it up in get_futex_key. */ +retry: + q->key = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); + if (unlikely(ret != 0)) + return ret; + +retry_private: + *hb = queue_lock(q); + ret = get_futex_value_locked(&uval, uaddr); - if (unlikely(ret)) { - queue_unlock(&q, hb); + if (ret) { + queue_unlock(q, *hb); ret = get_user(uval, uaddr); if (ret) - goto out_put_key; + goto out; if (!fshared) goto retry_private; - put_futex_key(fshared, &q.key); + put_futex_key(fshared, &q->key); goto retry; } - ret = -EWOULDBLOCK; - if (unlikely(uval != val)) { - queue_unlock(&q, hb); - goto out_put_key; - } - /* Only actually queue if *uaddr contained val. */ - queue_me(&q, hb); + if (uval != val) { + queue_unlock(q, *hb); + ret = -EWOULDBLOCK; + } - /* - * There might have been scheduling since the queue_me(), as we - * cannot hold a spinlock across the get_user() in case it - * faults, and we cannot just set TASK_INTERRUPTIBLE state when - * queueing ourselves into the futex hash. This code thus has to - * rely on the futex_wake() code removing us from hash when it - * wakes us up. - */ +out: + if (ret) + put_futex_key(fshared, &q->key); + return ret; +} - /* add_wait_queue is the barrier after __set_current_state. */ - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&q.waiter, &wait); - /* - * !plist_node_empty() is safe here without any lock. - * q.lock_ptr != 0 is not safe, because of ordering against wakeup. - */ - if (likely(!plist_node_empty(&q.list))) { - if (!abs_time) - schedule(); - else { - hrtimer_init_on_stack(&t.timer, - clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(&t, current); - hrtimer_set_expires_range_ns(&t.timer, *abs_time, - current->timer_slack_ns); - - hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&t.timer)) - t.task = NULL; +static int futex_wait(u32 __user *uaddr, int fshared, + u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct restart_block *restart; + struct futex_hash_bucket *hb; + struct futex_q q; + int ret; - /* - * the timer could have already expired, in which - * case current would be flagged for rescheduling. - * Don't bother calling schedule. - */ - if (likely(t.task)) - schedule(); + if (!bitset) + return -EINVAL; - hrtimer_cancel(&t.timer); + q.pi_state = NULL; + q.bitset = bitset; + q.rt_waiter = NULL; - /* Flag if a timeout occured */ - rem = (t.task == NULL); + if (abs_time) { + to = &timeout; - destroy_hrtimer_on_stack(&t.timer); - } + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); } - __set_current_state(TASK_RUNNING); - /* - * NOTE: we don't remove ourselves from the waitqueue because - * we are the only user of it. - */ + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) + goto out; + + /* queue_me and wait for wakeup, timeout, or a signal. */ + futex_wait_queue_me(hb, &q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; if (!unqueue_me(&q)) goto out_put_key; ret = -ETIMEDOUT; - if (rem) + if (to && !to->task) goto out_put_key; /* @@ -1268,7 +1757,7 @@ retry_private: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = 0; + restart->futex.flags = FLAGS_HAS_TIMEOUT; if (fshared) restart->futex.flags |= FLAGS_SHARED; @@ -1280,6 +1769,10 @@ retry_private: out_put_key: put_futex_key(fshared, &q.key); out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } return ret; } @@ -1288,13 +1781,16 @@ static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; int fshared = 0; - ktime_t t; + ktime_t t, *tp = NULL; - t.tv64 = restart->futex.time; + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { + t.tv64 = restart->futex.time; + tp = &t; + } restart->fn = do_no_restart_syscall; if (restart->futex.flags & FLAGS_SHARED) fshared = 1; - return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, + return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, restart->futex.bitset, restart->futex.flags & FLAGS_CLOCKRT); } @@ -1310,11 +1806,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; - struct task_struct *curr = current; struct futex_hash_bucket *hb; - u32 uval, newval, curval; + u32 uval; struct futex_q q; - int ret, lock_taken, ownerdied = 0; + int res, ret; if (refill_pi_state_cache()) return -ENOMEM; @@ -1328,90 +1823,25 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, } q.pi_state = NULL; + q.rt_waiter = NULL; retry: q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); + ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; retry_private: hb = queue_lock(&q); -retry_locked: - ret = lock_taken = 0; - - /* - * To avoid races, we attempt to take the lock here again - * (by doing a 0 -> TID atomic cmpxchg), while holding all - * the locks. It will most likely not succeed. - */ - newval = task_pid_vnr(current); - - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - - /* - * Detect deadlocks. In case of REQUEUE_PI this is a valid - * situation and we return success to user space. - */ - if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { - ret = -EDEADLK; - goto out_unlock_put_key; - } - - /* - * Surprise - we got the lock. Just return to userspace: - */ - if (unlikely(!curval)) - goto out_unlock_put_key; - - uval = curval; - - /* - * Set the WAITERS flag, so the owner will know it has someone - * to wake at next unlock - */ - newval = curval | FUTEX_WAITERS; - - /* - * There are two cases, where a futex might have no owner (the - * owner TID is 0): OWNER_DIED. We take over the futex in this - * case. We also do an unconditional take over, when the owner - * of the futex died. - * - * This is safe as we are protected by the hash bucket lock ! - */ - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { - /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); - ownerdied = 0; - lock_taken = 1; - } - - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - if (unlikely(curval != uval)) - goto retry_locked; - - /* - * We took the lock due to owner died take over. - */ - if (unlikely(lock_taken)) - goto out_unlock_put_key; - - /* - * We dont have the lock. Look up the PI state (or create it if - * we are the first waiter): - */ - ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); - + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); if (unlikely(ret)) { switch (ret) { - + case 1: + /* We got the lock. */ + ret = 0; + goto out_unlock_put_key; + case -EFAULT: + goto uaddr_faulted; case -EAGAIN: /* * Task is exiting and we just wait for the @@ -1421,25 +1851,6 @@ retry_locked: put_futex_key(fshared, &q.key); cond_resched(); goto retry; - - case -ESRCH: - /* - * No owner found for this futex. Check if the - * OWNER_DIED bit is set to figure out whether - * this is a robust futex or not. - */ - if (get_futex_value_locked(&curval, uaddr)) - goto uaddr_faulted; - - /* - * We simply start over in case of a robust - * futex. The code above will take the futex - * and return happy. - */ - if (curval & FUTEX_OWNER_DIED) { - ownerdied = 1; - goto retry_locked; - } default: goto out_unlock_put_key; } @@ -1463,71 +1874,21 @@ retry_locked: } spin_lock(q.lock_ptr); - - if (!ret) { - /* - * Got the lock. We might not be the anticipated owner - * if we did a lock-steal - fix up the PI-state in - * that case: - */ - if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); - } else { - /* - * Catch the rare case, where the lock was released - * when we were on the way back before we locked the - * hash bucket. - */ - if (q.pi_state->owner == curr) { - /* - * Try to get the rt_mutex now. This might - * fail as some other task acquired the - * rt_mutex after we removed ourself from the - * rt_mutex waiters list. - */ - if (rt_mutex_trylock(&q.pi_state->pi_mutex)) - ret = 0; - else { - /* - * pi_state is incorrect, some other - * task did a lock steal and we - * returned due to timeout or signal - * without taking the rt_mutex. Too - * late. We can access the - * rt_mutex_owner without locking, as - * the other task is now blocked on - * the hash bucket lock. Fix the state - * up. - */ - struct task_struct *owner; - int res; - - owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner, - fshared); - - /* propagate -EFAULT, if the fixup failed */ - if (res) - ret = res; - } - } else { - /* - * Paranoia check. If we did not take the lock - * in the trylock above, then we should not be - * the owner of the rtmutex, neither the real - * nor the pending one: - */ - if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) - printk(KERN_ERR "futex_lock_pi: ret = %d " - "pi-mutex: %p pi-state %p\n", ret, - q.pi_state->pi_mutex.owner, - q.pi_state->owner); - } - } + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr, fshared, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it acquired + * the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; /* - * If fixup_pi_state_owner() faulted and was unable to handle the - * fault, unlock it and return the fault to userspace. + * If fixup_owner() faulted and was unable to handle the fault, unlock + * it and return the fault to userspace. */ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) rt_mutex_unlock(&q.pi_state->pi_mutex); @@ -1535,9 +1896,7 @@ retry_locked: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - if (to) - destroy_hrtimer_on_stack(&to->timer); - return ret != -EINTR ? ret : -ERESTARTNOINTR; + goto out; out_unlock_put_key: queue_unlock(&q, hb); @@ -1547,7 +1906,7 @@ out_put_key: out: if (to) destroy_hrtimer_on_stack(&to->timer); - return ret; + return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: /* @@ -1570,7 +1929,6 @@ uaddr_faulted: goto retry; } - /* * Userspace attempted a TID -> 0 atomic transition, and failed. * This is the in-kernel slowpath: we look up the PI state (if any), @@ -1594,7 +1952,7 @@ retry: if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; @@ -1672,6 +2030,229 @@ pi_faulted: return ret; } +/** + * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex + * @hb: the hash_bucket futex_q was original enqueued on + * @q: the futex_q woken while waiting to be requeued + * @key2: the futex_key of the requeue target futex + * @timeout: the timeout associated with the wait (NULL if none) + * + * Detect if the task was woken on the initial futex as opposed to the requeue + * target futex. If so, determine if it was a timeout or a signal that caused + * the wakeup and return the appropriate error code to the caller. Must be + * called with the hb lock held. + * + * Returns + * 0 - no early wakeup detected + * <0 - -ETIMEDOUT or -ERESTARTNOINTR + */ +static inline +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, + struct futex_q *q, union futex_key *key2, + struct hrtimer_sleeper *timeout) +{ + int ret = 0; + + /* + * With the hb lock held, we avoid races while we process the wakeup. + * We only need to hold hb (and not hb2) to ensure atomicity as the + * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. + * It can't be requeued from uaddr2 to something else since we don't + * support a PI aware source futex for requeue. + */ + if (!match_futex(&q->key, key2)) { + WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &q->list.plist); + drop_futex_key_refs(&q->key); + + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else + ret = -ERESTARTNOINTR; + } + return ret; +} + +/** + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 + * @uaddr: the futex we initialyl wait on (non-pi) + * @fshared: whether the futexes are shared (1) or not (0). They must be + * the same type, no requeueing from private to shared, etc. + * @val: the expected value of uaddr + * @abs_time: absolute timeout + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. + * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) + * @uaddr2: the pi futex we will take prior to returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to + * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and + * complete the acquisition of the rt_mutex prior to returning to userspace. + * This ensures the rt_mutex maintains an owner when it has waiters; without + * one, the pi logic wouldn't know which task to boost/deboost, if there was a + * need to. + * + * We call schedule in futex_wait_queue_me() when we enqueue and return there + * via the following: + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() + * 2) wakeup on uaddr2 after a requeue and subsequent unlock + * 3) signal (before or after requeue) + * 4) timeout (before or after requeue) + * + * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. + * + * If 2, we may then block on trying to take the rt_mutex and return via: + * 5) successful lock + * 6) signal + * 7) timeout + * 8) other lock acquisition failure + * + * If 6, we setup a restart_block with futex_lock_pi() as the function. + * + * If 4 or 7, we cleanup and return with -ETIMEDOUT. + * + * Returns: + * 0 - On success + * <0 - On error + */ +static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, + u32 val, ktime_t *abs_time, u32 bitset, + int clockrt, u32 __user *uaddr2) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct rt_mutex_waiter rt_waiter; + struct rt_mutex *pi_mutex = NULL; + struct futex_hash_bucket *hb; + union futex_key key2; + struct futex_q q; + int res, ret; + + if (!bitset) + return -EINVAL; + + if (abs_time) { + to = &timeout; + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + + /* + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. + */ + debug_rt_mutex_init_waiter(&rt_waiter); + rt_waiter.task = NULL; + + q.pi_state = NULL; + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + + key2 = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); + if (unlikely(ret != 0)) + goto out; + + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) + goto out_key2; + + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + futex_wait_queue_me(hb, &q, to); + + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; + + /* + * In order for us to be here, we know our q.key == key2, and since + * we took the hb->lock above, we also know that futex_requeue() has + * completed and we no longer have to concern ourselves with a wakeup + * race with the atomic proxy lock acquition by the requeue code. + */ + + /* Check if the requeue code acquired the second futex for us. */ + if (!q.rt_waiter) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (q.pi_state && (q.pi_state->owner != current)) { + spin_lock(q.lock_ptr); + ret = fixup_pi_state_owner(uaddr2, &q, current, + fshared); + spin_unlock(q.lock_ptr); + } + } else { + /* + * We have been woken up by futex_unlock_pi(), a timeout, or a + * signal. futex_unlock_pi() will not destroy the lock_ptr nor + * the pi_state. + */ + WARN_ON(!&q.pi_state); + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); + debug_rt_mutex_free_waiter(&rt_waiter); + + spin_lock(q.lock_ptr); + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr2, fshared, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it + * acquired the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + /* Unqueue and drop the lock. */ + unqueue_me_pi(&q); + } + + /* + * If fixup_pi_state_owner() faulted and was unable to handle the + * fault, unlock the rt_mutex and return the fault to userspace. + */ + if (ret == -EFAULT) { + if (rt_mutex_owner(pi_mutex) == current) + rt_mutex_unlock(pi_mutex); + } else if (ret == -EINTR) { + /* + * We've already been requeued, but we have no way to + * restart by calling futex_lock_pi() directly. We + * could restart the syscall, but that will look at + * the user space value and return right away. So we + * drop back with EWOULDBLOCK to tell user space that + * "val" has been changed. That's the same what the + * restart of the syscall would do in + * futex_wait_setup(). + */ + ret = -EWOULDBLOCK; + } + +out_put_keys: + put_futex_key(fshared, &q.key); +out_key2: + put_futex_key(fshared, &key2); + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +} + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. @@ -1894,7 +2475,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, fshared = 1; clockrt = op & FUTEX_CLOCK_REALTIME; - if (clockrt && cmd != FUTEX_WAIT_BITSET) + if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; switch (cmd) { @@ -1909,10 +2490,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake(uaddr, fshared, val, val3); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 0); break; case FUTEX_WAKE_OP: ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); @@ -1929,6 +2511,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (futex_cmpxchg_enabled) ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; + case FUTEX_WAIT_REQUEUE_PI: + val3 = FUTEX_BITSET_MATCH_ANY; + ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, + clockrt, uaddr2); + break; + case FUTEX_CMP_REQUEUE_PI: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 1); + break; default: ret = -ENOSYS; } @@ -1946,7 +2537,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) @@ -1958,11 +2550,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } /* - * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. + * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. */ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_WAKE_OP) + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 3394f8f..7d04780 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o +obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c687ba4..13c68e7 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) spin_lock(&desc->lock); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); spin_unlock(&desc->lock); } @@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) !desc->action)) { desc->status |= (IRQ_PENDING | IRQ_MASKED); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); goto out_unlock; } kstat_incr_irqs_this_cpu(irq, desc); @@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) /* Start handling the irq */ if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->chip->eoi) { + if (desc->chip->eoi) desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); - } } void @@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) { + if (desc->chip != &no_irq_chip) mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); - } desc->status |= IRQ_DISABLED; desc->depth = 1; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 26e0875..065205b 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,14 +11,15 @@ */ #include <linux/irq.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/rculist.h> #include <linux/hash.h> -#include <trace/irq.h> #include <linux/bootmem.h> +#include <trace/events/irq.h> #include "internals.h" @@ -44,7 +45,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) static void __init init_irq_default_affinity(void) { - alloc_bootmem_cpumask_var(&irq_default_affinity); + alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); cpumask_setall(irq_default_affinity); } #else @@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = { .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; -void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) +void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) { - int node; void *ptr; - node = cpu_to_node(cpu); - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); + if (slab_is_available()) + ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), + GFP_ATOMIC, node); + else + ptr = alloc_bootmem_node(NODE_DATA(node), + nr * sizeof(*desc->kstat_irqs)); /* * don't overwite if can not get new one * init_copy_kstat_irqs() could still use old one */ if (ptr) { - printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", - cpu, node); + printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); desc->kstat_irqs = ptr; } } -static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) +static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); spin_lock_init(&desc->lock); desc->irq = irq; #ifdef CONFIG_SMP - desc->cpu = cpu; + desc->node = node; #endif lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_kstat_irqs(desc, cpu, nr_cpu_ids); + init_kstat_irqs(desc, node, nr_cpu_ids); if (!desc->kstat_irqs) { printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); BUG_ON(1); } - arch_init_chip_data(desc, cpu); + init_desc_masks(desc); + arch_init_chip_data(desc, node); } /* @@ -146,6 +150,7 @@ int __init early_irq_init(void) { struct irq_desc *desc; int legacy_count; + int node; int i; init_irq_default_affinity(); @@ -156,20 +161,21 @@ int __init early_irq_init(void) desc = irq_desc_legacy; legacy_count = ARRAY_SIZE(irq_desc_legacy); + node = first_online_node; /* allocate irq_desc_ptrs array based on nr_irqs */ - irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); + irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); /* allocate based on nr_cpu_ids */ - /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ - kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * - sizeof(int)); + kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * + sizeof(int), GFP_NOWAIT, node); for (i = 0; i < legacy_count; i++) { desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], node, true); + init_desc_masks(&desc[i]); irq_desc_ptrs[i] = desc + i; } @@ -187,11 +193,10 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; unsigned long flags; - int node; if (irq >= nr_irqs) { WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", @@ -210,15 +215,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) if (desc) goto out_unlock; - node = cpu_to_node(cpu); - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", - irq, cpu, node); + if (slab_is_available()) + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + else + desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); + + printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); if (!desc) { printk(KERN_ERR "can not alloc irq_desc\n"); BUG_ON(1); } - init_one_irq_desc(irq, desc, cpu); + init_one_irq_desc(irq, desc, node); irq_desc_ptrs[irq] = desc; @@ -256,7 +263,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq = i; - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); desc[i].kstat_irqs = kstat_irqs_all[i]; } return arch_early_irq_init(); @@ -267,7 +275,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) { return irq_to_desc(irq); } @@ -348,9 +356,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -DEFINE_TRACE(irq_handler_entry); -DEFINE_TRACE(irq_handler_exit); - /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number @@ -453,11 +458,8 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - /* get new one */ - desc = irq_remap_to_desc(irq, desc); - } if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) @@ -468,10 +470,8 @@ unsigned int __do_IRQ(unsigned int irq) } spin_lock(&desc->lock); - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); - } /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 01ce20e..7346825 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; -extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); +extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); extern void clear_kstat_irqs(struct irq_desc *desc); extern spinlock_t sparse_irq_lock; @@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq, extern int irq_select_affinity_usr(unsigned int irq); +extern void +irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2734eca..aaf5c9d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq) return 1; } -static void +void irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) { struct irqaction *action = desc->action; @@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) - desc->chip->set_affinity(irq, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) { + if (!desc->chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->affinity, cpumask); + irq_set_thread_affinity(desc, cpumask); + } + } else { desc->status |= IRQ_MOVE_PENDING; cpumask_copy(desc->pending_mask, cpumask); } #else - cpumask_copy(desc->affinity, cpumask); - desc->chip->set_affinity(irq, cpumask); + if (!desc->chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->affinity, cpumask); + irq_set_thread_affinity(desc, cpumask); + } #endif - irq_set_thread_affinity(desc, cpumask); desc->status |= IRQ_AFFINITY_SET; spin_unlock_irqrestore(&desc->lock, flags); return 0; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index e05ad9b..cfe767c 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -1,5 +1,8 @@ #include <linux/irq.h> +#include <linux/interrupt.h> + +#include "internals.h" void move_masked_irq(int irq) { @@ -39,11 +42,12 @@ void move_masked_irq(int irq) * masking the irqs. */ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) { - cpumask_and(desc->affinity, - desc->pending_mask, cpu_online_mask); - desc->chip->set_affinity(irq, desc->affinity); - } + < nr_cpu_ids)) + if (!desc->chip->set_affinity(irq, desc->pending_mask)) { + cpumask_copy(desc->affinity, desc->pending_mask); + irq_set_thread_affinity(desc, desc->pending_mask); + } + cpumask_clear(desc->pending_mask); } diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 44bbdcb..2f69bee 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -15,9 +15,9 @@ static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc, - int cpu, int nr) + int node, int nr) { - init_kstat_irqs(desc, cpu, nr); + init_kstat_irqs(desc, node, nr); if (desc->kstat_irqs != old_desc->kstat_irqs) memcpy(desc->kstat_irqs, old_desc->kstat_irqs, @@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) } static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " "for migration.\n", irq); return false; } spin_lock_init(&desc->lock); - desc->cpu = cpu; + desc->node = node; lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); + init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); init_copy_desc_masks(old_desc, desc); - arch_init_copy_chip_data(old_desc, desc, cpu); + arch_init_copy_chip_data(old_desc, desc, node); return true; } @@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) } static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, - int cpu) + int node) { struct irq_desc *desc; unsigned int irq; unsigned long flags; - int node; irq = old_desc->irq; @@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, if (desc && old_desc != desc) goto out_unlock; - node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); if (!desc) { printk(KERN_ERR "irq %d: can not get new irq_desc " @@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, desc = old_desc; goto out_unlock; } - if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { + if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { /* still use old one */ kfree(desc); desc = old_desc; @@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, /* free the old one */ free_one_irq_desc(old_desc, desc); - spin_unlock(&old_desc->lock); kfree(old_desc); - spin_lock(&desc->lock); return desc; @@ -109,24 +105,14 @@ out_unlock: return desc; } -struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) +struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { - int old_cpu; - int node, old_node; - /* those all static, do move them */ if (desc->irq < NR_IRQS_LEGACY) return desc; - old_cpu = desc->cpu; - if (old_cpu != cpu) { - node = cpu_to_node(cpu); - old_node = cpu_to_node(old_cpu); - if (old_node != node) - desc = __real_move_irq_desc(desc, cpu); - else - desc->cpu = cpu; - } + if (desc->node != node) + desc = __real_move_irq_desc(desc, node); return desc; } diff --git a/kernel/kexec.c b/kernel/kexec.c index 5a758c6e..ae1c352 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1448,18 +1448,17 @@ int kernel_kexec(void) goto Restore_console; } suspend_console(); - error = device_suspend(PMSG_FREEZE); + error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; - device_pm_lock(); - /* At this point, device_suspend() has been called, - * but *not* device_power_down(). We *must* - * device_power_down() now. Otherwise, drivers for + /* At this point, dpm_suspend_start() has been called, + * but *not* dpm_suspend_noirq(). We *must* call + * dpm_suspend_noirq() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ - error = device_power_down(PMSG_FREEZE); + error = dpm_suspend_noirq(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -1487,10 +1486,9 @@ int kernel_kexec(void) local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); - device_power_up(PMSG_RESTORE); + dpm_resume_noirq(PMSG_RESTORE); Resume_devices: - device_pm_unlock(); - device_resume(PMSG_RESTORE); + dpm_resume_end(PMSG_RESTORE); Resume_console: resume_console(); thaw_processes(); diff --git a/kernel/kgdb.c b/kernel/kgdb.c index e4dcfb2..9147a31 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1583,8 +1583,8 @@ static void sysrq_handle_gdb(int key, struct tty_struct *tty) static struct sysrq_key_op sysrq_gdb_op = { .handler = sysrq_handle_gdb, - .help_msg = "Gdb", - .action_msg = "GDB", + .help_msg = "debug(G)", + .action_msg = "DEBUG", }; #endif diff --git a/kernel/kmod.c b/kernel/kmod.c index b750675..7e95bed 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -370,8 +370,10 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->argv = argv; sub_info->envp = envp; sub_info->cred = prepare_usermodehelper_creds(); - if (!sub_info->cred) + if (!sub_info->cred) { + kfree(sub_info); return NULL; + } out: return sub_info; diff --git a/kernel/kthread.c b/kernel/kthread.c index 4ebaf85..41c88fe 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,7 +13,7 @@ #include <linux/file.h> #include <linux/module.h> #include <linux/mutex.h> -#include <trace/sched.h> +#include <trace/events/sched.h> #define KTHREAD_NICE_LEVEL (-5) @@ -21,9 +21,6 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; -DEFINE_TRACE(sched_kthread_stop); -DEFINE_TRACE(sched_kthread_stop_ret); - struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index accb40c..8bbeef9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,12 +42,14 @@ #include <linux/hash.h> #include <linux/ftrace.h> #include <linux/stringify.h> -#include <trace/lockdep.h> #include <asm/sections.h> #include "lockdep_internals.h" +#define CREATE_TRACE_POINTS +#include <trace/events/lockdep.h> + #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -2935,8 +2937,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); -DEFINE_TRACE(lock_acquire); - /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -2963,8 +2963,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, } EXPORT_SYMBOL_GPL(lock_acquire); -DEFINE_TRACE(lock_release); - void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { @@ -3105,6 +3103,8 @@ found_it: hlock->holdtime_stamp = now; } + trace_lock_acquired(lock, ip, waittime); + stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) @@ -3120,8 +3120,6 @@ found_it: lock->ip = ip; } -DEFINE_TRACE(lock_contended); - void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; @@ -3143,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_contended); -DEFINE_TRACE(lock_acquired); - void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - trace_lock_acquired(lock, ip); - if (unlikely(!lock_stat)) return; diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index a2cc7e9..699a2ac 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -54,9 +54,9 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ -#define MAX_LOCKDEP_ENTRIES 8192UL +#define MAX_LOCKDEP_ENTRIES 16384UL -#define MAX_LOCKDEP_CHAINS_BITS 14 +#define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) diff --git a/kernel/module.c b/kernel/module.c index e797812..e4ab36c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -18,6 +18,7 @@ */ #include <linux/module.h> #include <linux/moduleloader.h> +#include <linux/ftrace_event.h> #include <linux/init.h> #include <linux/kallsyms.h> #include <linux/fs.h> @@ -52,6 +53,7 @@ #include <linux/ftrace.h> #include <linux/async.h> #include <linux/percpu.h> +#include <linux/kmemleak.h> #if 0 #define DEBUGP printk @@ -72,6 +74,9 @@ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); +/* Block module loading/unloading? */ +int modules_disabled = 0; + /* Waiting for a module to finish initializing? */ static DECLARE_WAIT_QUEUE_HEAD(module_wq); @@ -429,6 +434,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, unsigned long extra; unsigned int i; void *ptr; + int cpu; if (align > PAGE_SIZE) { printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", @@ -458,6 +464,11 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, if (!split_block(i, size)) return NULL; + /* add the per-cpu scanning areas */ + for_each_possible_cpu(cpu) + kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0, + GFP_KERNEL); + /* Mark allocated */ pcpu_size[i] = -pcpu_size[i]; return ptr; @@ -472,6 +483,7 @@ static void percpu_modfree(void *freeme) { unsigned int i; void *ptr = __per_cpu_start + block_size(pcpu_size[0]); + int cpu; /* First entry is core kernel percpu data. */ for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { @@ -483,6 +495,10 @@ static void percpu_modfree(void *freeme) BUG(); free: + /* remove the per-cpu scanning areas */ + for_each_possible_cpu(cpu) + kmemleak_free(freeme + per_cpu_offset(cpu)); + /* Merge with previous? */ if (pcpu_size[i-1] >= 0) { pcpu_size[i-1] += pcpu_size[i]; @@ -777,7 +793,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, char name[MODULE_NAME_LEN]; int ret, forced = 0; - if (!capable(CAP_SYS_MODULE)) + if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) @@ -1489,9 +1505,6 @@ static void free_module(struct module *mod) /* Free any allocated parameters. */ destroy_params(mod->kp, mod->num_kp); - /* release any pointers to mcount in this module */ - ftrace_release(mod->module_core, mod->core_size); - /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); @@ -1878,6 +1891,36 @@ static void *module_alloc_update_bounds(unsigned long size) return ret; } +#ifdef CONFIG_DEBUG_KMEMLEAK +static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, char *secstrings) +{ + unsigned int i; + + /* only scan the sections containing data */ + kmemleak_scan_area(mod->module_core, (unsigned long)mod - + (unsigned long)mod->module_core, + sizeof(struct module), GFP_KERNEL); + + for (i = 1; i < hdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 + && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) + continue; + + kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - + (unsigned long)mod->module_core, + sechdrs[i].sh_size, GFP_KERNEL); + } +} +#else +static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, char *secstrings) +{ +} +#endif + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static noinline struct module *load_module(void __user *umod, @@ -1892,11 +1935,9 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int num_mcount; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ - unsigned long *mseg; mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2050,6 +2091,12 @@ static noinline struct module *load_module(void __user *umod, /* Do the allocs. */ ptr = module_alloc_update_bounds(mod->core_size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. Just mark it as not being a + * leak. + */ + kmemleak_not_leak(ptr); if (!ptr) { err = -ENOMEM; goto free_percpu; @@ -2058,6 +2105,13 @@ static noinline struct module *load_module(void __user *umod, mod->module_core = ptr; ptr = module_alloc_update_bounds(mod->init_size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. This block doesn't need to be + * scanned as it contains data and code that will be freed + * after the module is initialized. + */ + kmemleak_ignore(ptr); if (!ptr && mod->init_size) { err = -ENOMEM; goto free_core; @@ -2088,6 +2142,7 @@ static noinline struct module *load_module(void __user *umod, } /* Module has been moved. */ mod = (void *)sechdrs[modindex].sh_addr; + kmemleak_load_module(mod, hdr, sechdrs, secstrings); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), @@ -2172,7 +2227,19 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->tracepoints), &mod->num_tracepoints); #endif - +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(hdr, sechdrs, secstrings, + "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); +#endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, + "__mcount_loc", + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2237,11 +2304,6 @@ static noinline struct module *load_module(void __user *umod, dynamic_debug_setup(debug, num_debug); } - /* sechdrs[0].sh_size is always zero */ - mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", - sizeof(*mseg), &num_mcount); - ftrace_init_module(mod, mseg, mseg + num_mcount); - err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2302,7 +2364,6 @@ static noinline struct module *load_module(void __user *umod, cleanup: kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); - ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) @@ -2336,7 +2397,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, int ret = 0; /* Must have permission */ - if (!capable(CAP_SYS_MODULE)) + if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; /* Only one module load at a time, please */ @@ -2394,6 +2455,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); + trim_init_extable(mod); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; diff --git a/kernel/mutex.c b/kernel/mutex.c index 507cf2b..947b3ad 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count); * * This function is similar to (but not equivalent to) down(). */ -void inline __sched mutex_lock(struct mutex *lock) +void __sched mutex_lock(struct mutex *lock) { might_sleep(); /* @@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - __schedule(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); spin_lock_mutex(&lock->wait_lock, flags); } @@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock) return ret; } - EXPORT_SYMBOL(mutex_trylock); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/panic.c b/kernel/panic.c index 874ecf1..984b3ec 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -340,39 +340,44 @@ void oops_exit(void) } #ifdef WANT_WARN_ON_SLOWPATH -void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) -{ +struct slowpath_args { + const char *fmt; va_list args; - char function[KSYM_SYMBOL_LEN]; - unsigned long caller = (unsigned long)__builtin_return_address(0); - const char *board; +}; - sprint_symbol(function, caller); +static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args) +{ + const char *board; printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, - line, function); + printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); board = dmi_get_system_info(DMI_PRODUCT_NAME); if (board) printk(KERN_WARNING "Hardware name: %s\n", board); - if (*fmt) { - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - } + if (args) + vprintk(args->fmt, args->args); print_modules(); dump_stack(); print_oops_end_marker(); add_taint(TAINT_WARN); } + +void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) +{ + struct slowpath_args args; + + args.fmt = fmt; + va_start(args.args, fmt); + warn_slowpath_common(file, line, __builtin_return_address(0), &args); + va_end(args.args); +} EXPORT_SYMBOL(warn_slowpath_fmt); void warn_slowpath_null(const char *file, int line) { - static const char *empty = ""; - warn_slowpath_fmt(file, line, empty); + warn_slowpath_common(file, line, __builtin_return_address(0), NULL); } EXPORT_SYMBOL(warn_slowpath_null); #endif diff --git a/kernel/params.c b/kernel/params.c index de273ec..7f6912c 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -24,9 +24,6 @@ #include <linux/err.h> #include <linux/slab.h> -/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */ -#define KPARAM_KMALLOCED 0x80000000 - #if 0 #define DEBUGP printk #else @@ -220,13 +217,13 @@ int param_set_charp(const char *val, struct kernel_param *kp) return -ENOSPC; } - if (kp->perm & KPARAM_KMALLOCED) + if (kp->flags & KPARAM_KMALLOCED) kfree(*(char **)kp->arg); /* This is a hack. We can't need to strdup in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { - kp->perm |= KPARAM_KMALLOCED; + kp->flags |= KPARAM_KMALLOCED; *(char **)kp->arg = kstrdup(val, GFP_KERNEL); if (!kp->arg) return -ENOMEM; @@ -241,44 +238,63 @@ int param_get_charp(char *buffer, struct kernel_param *kp) return sprintf(buffer, "%s", *((char **)kp->arg)); } +/* Actually could be a bool or an int, for historical reasons. */ int param_set_bool(const char *val, struct kernel_param *kp) { + bool v; + /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ switch (val[0]) { case 'y': case 'Y': case '1': - *(int *)kp->arg = 1; - return 0; + v = true; + break; case 'n': case 'N': case '0': - *(int *)kp->arg = 0; - return 0; + v = false; + break; + default: + return -EINVAL; } - return -EINVAL; + + if (kp->flags & KPARAM_ISBOOL) + *(bool *)kp->arg = v; + else + *(int *)kp->arg = v; + return 0; } int param_get_bool(char *buffer, struct kernel_param *kp) { + bool val; + if (kp->flags & KPARAM_ISBOOL) + val = *(bool *)kp->arg; + else + val = *(int *)kp->arg; + /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N'); + return sprintf(buffer, "%c", val ? 'Y' : 'N'); } +/* This one must be bool. */ int param_set_invbool(const char *val, struct kernel_param *kp) { - int boolval, ret; + int ret; + bool boolval; struct kernel_param dummy; dummy.arg = &boolval; + dummy.flags = KPARAM_ISBOOL; ret = param_set_bool(val, &dummy); if (ret == 0) - *(int *)kp->arg = !boolval; + *(bool *)kp->arg = !boolval; return ret; } int param_get_invbool(char *buffer, struct kernel_param *kp) { - return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y'); + return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); } /* We break the rule and mangle the string. */ @@ -591,7 +607,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) unsigned int i; for (i = 0; i < num; i++) - if (params[i].perm & KPARAM_KMALLOCED) + if (params[i].flags & KPARAM_KMALLOCED) kfree(*(char **)params[i].arg); } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c new file mode 100644 index 0000000..29b685f --- /dev/null +++ b/kernel/perf_counter.c @@ -0,0 +1,4339 @@ +/* + * Performance counter core code + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * For licensing details see kernel-base/COPYING + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/file.h> +#include <linux/poll.h> +#include <linux/sysfs.h> +#include <linux/dcache.h> +#include <linux/percpu.h> +#include <linux/ptrace.h> +#include <linux/vmstat.h> +#include <linux/hardirq.h> +#include <linux/rculist.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> +#include <linux/anon_inodes.h> +#include <linux/kernel_stat.h> +#include <linux/perf_counter.h> + +#include <asm/irq_regs.h> + +/* + * Each CPU has a list of per CPU counters: + */ +DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +int perf_max_counters __read_mostly = 1; +static int perf_reserved_percpu __read_mostly; +static int perf_overcommit __read_mostly = 1; + +static atomic_t nr_counters __read_mostly; +static atomic_t nr_mmap_counters __read_mostly; +static atomic_t nr_comm_counters __read_mostly; + +/* + * perf counter paranoia level: + * 0 - not paranoid + * 1 - disallow cpu counters to unpriv + * 2 - disallow kernel profiling to unpriv + */ +int sysctl_perf_counter_paranoid __read_mostly; + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_counter_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_counter_paranoid > 1; +} + +int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ + +/* + * max perf counter sample rate + */ +int sysctl_perf_counter_sample_rate __read_mostly = 100000; + +static atomic64_t perf_counter_id; + +/* + * Lock for (sysadmin-configurable) counter reservations: + */ +static DEFINE_SPINLOCK(perf_resource_lock); + +/* + * Architecture provided APIs - weak aliases: + */ +extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +{ + return NULL; +} + +void __weak hw_perf_disable(void) { barrier(); } +void __weak hw_perf_enable(void) { barrier(); } + +void __weak hw_perf_counter_setup(int cpu) { barrier(); } + +int __weak +hw_perf_group_sched_in(struct perf_counter *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, int cpu) +{ + return 0; +} + +void __weak perf_counter_print_debug(void) { } + +static DEFINE_PER_CPU(int, disable_count); + +void __perf_disable(void) +{ + __get_cpu_var(disable_count)++; +} + +bool __perf_enable(void) +{ + return !--__get_cpu_var(disable_count); +} + +void perf_disable(void) +{ + __perf_disable(); + hw_perf_disable(); +} + +void perf_enable(void) +{ + if (__perf_enable()) + hw_perf_enable(); +} + +static void get_ctx(struct perf_counter_context *ctx) +{ + atomic_inc(&ctx->refcount); +} + +static void free_ctx(struct rcu_head *head) +{ + struct perf_counter_context *ctx; + + ctx = container_of(head, struct perf_counter_context, rcu_head); + kfree(ctx); +} + +static void put_ctx(struct perf_counter_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); + } +} + +/* + * Get the perf_counter_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_counter_context * +perf_lock_task_context(struct task_struct *task, unsigned long *flags) +{ + struct perf_counter_context *ctx; + + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_counter_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_counter_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_counter_ctxp)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) +{ + struct perf_counter_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + ++ctx->pin_count; + get_ctx(ctx); + spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_counter_context *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + put_ctx(ctx); +} + +/* + * Add a counter from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ + struct perf_counter *group_leader = counter->group_leader; + + /* + * Depending on whether it is a standalone or sibling counter, + * add it straight to the context's counter list, or to the group + * leader's sibling list: + */ + if (group_leader == counter) + list_add_tail(&counter->list_entry, &ctx->counter_list); + else { + list_add_tail(&counter->list_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + } + + list_add_rcu(&counter->event_entry, &ctx->event_list); + ctx->nr_counters++; +} + +/* + * Remove a counter from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ + struct perf_counter *sibling, *tmp; + + if (list_empty(&counter->list_entry)) + return; + ctx->nr_counters--; + + list_del_init(&counter->list_entry); + list_del_rcu(&counter->event_entry); + + if (counter->group_leader != counter) + counter->group_leader->nr_siblings--; + + /* + * If this was a group counter with sibling counters then + * upgrade the siblings to singleton counters by adding them + * to the context list directly: + */ + list_for_each_entry_safe(sibling, tmp, + &counter->sibling_list, list_entry) { + + list_move_tail(&sibling->list_entry, &ctx->counter_list); + sibling->group_leader = sibling; + } +} + +static void +counter_sched_out(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_stopped = ctx->time; + counter->pmu->disable(counter); + counter->oncpu = -1; + + if (!is_software_counter(counter)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (counter->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +static void +group_sched_out(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + + if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + + counter_sched_out(group_counter, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_out(counter, cpuctx, ctx); + + if (group_counter->attr.exclusive) + cpuctx->exclusive = 0; +} + +/* + * Cross CPU call to remove a performance counter + * + * We disable the counter on the hardware level first. After that we + * remove it from the context list. + */ +static void __perf_counter_remove_from_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + /* + * Protect the list operation against NMI by disabling the + * counters on a global level. + */ + perf_disable(); + + counter_sched_out(counter, cpuctx, ctx); + + list_del_counter(counter, ctx); + + if (!ctx->task) { + /* + * Allow more per task counters with respect to the + * reservation: + */ + cpuctx->max_pertask = + min(perf_max_counters - ctx->nr_counters, + perf_max_counters - perf_reserved_percpu); + } + + perf_enable(); + spin_unlock(&ctx->lock); +} + + +/* + * Remove the counter from a task's (or a CPU's) list of counters. + * + * Must be called with ctx->mutex held. + * + * CPU counters are removed with a smp call. For task counters we only + * call when the task is on a CPU. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_counter_exit_task, it's OK because the + * context has been detached from its task. + */ +static void perf_counter_remove_from_context(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu counters are removed via an smp call and + * the removal is always sucessful. + */ + smp_call_function_single(counter->cpu, + __perf_counter_remove_from_context, + counter, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_counter_remove_from_context, + counter); + + spin_lock_irq(&ctx->lock); + /* + * If the context is active we need to retry the smp call. + */ + if (ctx->nr_active && !list_empty(&counter->list_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can remove the counter safely, if the call above did not + * succeed. + */ + if (!list_empty(&counter->list_entry)) { + list_del_counter(counter, ctx); + } + spin_unlock_irq(&ctx->lock); +} + +static inline u64 perf_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_counter_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a counter. + */ +static void update_counter_times(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + u64 run_end; + + if (counter->state < PERF_COUNTER_STATE_INACTIVE) + return; + + counter->total_time_enabled = ctx->time - counter->tstamp_enabled; + + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + run_end = counter->tstamp_stopped; + else + run_end = ctx->time; + + counter->total_time_running = run_end - counter->tstamp_running; +} + +/* + * Update total_time_enabled and total_time_running for all counters in a group. + */ +static void update_group_times(struct perf_counter *leader) +{ + struct perf_counter *counter; + + update_counter_times(leader); + list_for_each_entry(counter, &leader->sibling_list, list_entry) + update_counter_times(counter); +} + +/* + * Cross CPU call to disable a performance counter + */ +static void __perf_counter_disable(void *info) +{ + struct perf_counter *counter = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = counter->ctx; + + /* + * If this is a per-task counter, need to check whether this + * counter's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + + /* + * If the counter is on, turn it off. + * If it is in error state, leave it in error state. + */ + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + update_context_time(ctx); + update_counter_times(counter); + if (counter == counter->group_leader) + group_sched_out(counter, cpuctx, ctx); + else + counter_sched_out(counter, cpuctx, ctx); + counter->state = PERF_COUNTER_STATE_OFF; + } + + spin_unlock(&ctx->lock); +} + +/* + * Disable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This condition is satisifed when called through + * perf_counter_for_each_child or perf_counter_for_each because they + * hold the top-level counter's child_mutex, so any descendant that + * goes to exit will block in sync_child_counter. + * When called from perf_pending_counter it's OK because counter->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_counter_task_sched_out for this context. + */ +static void perf_counter_disable(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the counter on the cpu that it's on + */ + smp_call_function_single(counter->cpu, __perf_counter_disable, + counter, 1); + return; + } + + retry: + task_oncpu_function_call(task, __perf_counter_disable, counter); + + spin_lock_irq(&ctx->lock); + /* + * If the counter is still active, we need to retry the cross-call. + */ + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); + counter->state = PERF_COUNTER_STATE_OFF; + } + + spin_unlock_irq(&ctx->lock); +} + +static int +counter_sched_in(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + if (counter->state <= PERF_COUNTER_STATE_OFF) + return 0; + + counter->state = PERF_COUNTER_STATE_ACTIVE; + counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + /* + * The new state must be visible before we turn it on in the hardware: + */ + smp_wmb(); + + if (counter->pmu->enable(counter)) { + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->oncpu = -1; + return -EAGAIN; + } + + counter->tstamp_running += ctx->time - counter->tstamp_stopped; + + if (!is_software_counter(counter)) + cpuctx->active_oncpu++; + ctx->nr_active++; + + if (counter->attr.exclusive) + cpuctx->exclusive = 1; + + return 0; +} + +static int +group_sched_in(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + struct perf_counter *counter, *partial_group; + int ret; + + if (group_counter->state == PERF_COUNTER_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? ret : 0; + + if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) + return -EAGAIN; + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + if (counter_sched_in(counter, cpuctx, ctx, cpu)) { + partial_group = counter; + goto group_error; + } + } + + return 0; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + if (counter == partial_group) + break; + counter_sched_out(counter, cpuctx, ctx); + } + counter_sched_out(group_counter, cpuctx, ctx); + + return -EAGAIN; +} + +/* + * Return 1 for a group consisting entirely of software counters, + * 0 if the group contains any hardware counters. + */ +static int is_software_only_group(struct perf_counter *leader) +{ + struct perf_counter *counter; + + if (!is_software_counter(leader)) + return 0; + + list_for_each_entry(counter, &leader->sibling_list, list_entry) + if (!is_software_counter(counter)) + return 0; + + return 1; +} + +/* + * Work out whether we can put this counter group on the CPU now. + */ +static int group_can_go_on(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software counters can always go on. + */ + if (is_software_only_group(counter)) + return 1; + /* + * If an exclusive group is already on, no other hardware + * counters can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * counters on the CPU, it can't go on. + */ + if (counter->attr.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. + */ + return can_add_hw; +} + +static void add_counter_to_ctx(struct perf_counter *counter, + struct perf_counter_context *ctx) +{ + list_add_counter(counter, ctx); + counter->tstamp_enabled = ctx->time; + counter->tstamp_running = ctx->time; + counter->tstamp_stopped = ctx->time; +} + +/* + * Cross CPU call to install and enable a performance counter + * + * Must be called with ctx->mutex held + */ +static void __perf_install_in_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *leader = counter->group_leader; + int cpu = smp_processor_id(); + int err; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no counters. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + /* + * Protect the list operation against NMI by disabling the + * counters on a global level. NOP for non NMI based counters. + */ + perf_disable(); + + add_counter_to_ctx(counter, ctx); + + /* + * Don't put the counter on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (counter->state != PERF_COUNTER_STATE_INACTIVE || + (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) + goto unlock; + + /* + * An exclusive counter can't go on if there are already active + * hardware counters, and no hardware counter can go on if there + * is already an exclusive counter on. + */ + if (!group_can_go_on(counter, cpuctx, 1)) + err = -EEXIST; + else + err = counter_sched_in(counter, cpuctx, ctx, cpu); + + if (err) { + /* + * This counter couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the counter group is pinned then put it in error state. + */ + if (leader != counter) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_COUNTER_STATE_ERROR; + } + } + + if (!err && !ctx->task && cpuctx->max_pertask) + cpuctx->max_pertask--; + + unlock: + perf_enable(); + + spin_unlock(&ctx->lock); +} + +/* + * Attach a performance counter to a context + * + * First we add the counter to the list with the hardware enable bit + * in counter->hw_config cleared. + * + * If the counter is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + * + * Must be called with ctx->mutex held. + */ +static void +perf_install_in_context(struct perf_counter_context *ctx, + struct perf_counter *counter, + int cpu) +{ + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu counters are installed via an smp call and + * the install is always sucessful. + */ + smp_call_function_single(cpu, __perf_install_in_context, + counter, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_install_in_context, + counter); + + spin_lock_irq(&ctx->lock); + /* + * we need to retry the smp call. + */ + if (ctx->is_active && list_empty(&counter->list_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can add the counter safely, if it the call above did not + * succeed. + */ + if (list_empty(&counter->list_entry)) + add_counter_to_ctx(counter, ctx); + spin_unlock_irq(&ctx->lock); +} + +/* + * Cross CPU call to enable a performance counter + */ +static void __perf_counter_enable(void *info) +{ + struct perf_counter *counter = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *leader = counter->group_leader; + int err; + + /* + * If this is a per-task counter, need to check whether this + * counter's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + goto unlock; + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + + /* + * If the counter is in a group and isn't the group leader, + * then don't put it on unless the group is on. + */ + if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) + goto unlock; + + if (!group_can_go_on(counter, cpuctx, 1)) { + err = -EEXIST; + } else { + perf_disable(); + if (counter == leader) + err = group_sched_in(counter, cpuctx, ctx, + smp_processor_id()); + else + err = counter_sched_in(counter, cpuctx, ctx, + smp_processor_id()); + perf_enable(); + } + + if (err) { + /* + * If this counter can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != counter) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_COUNTER_STATE_ERROR; + } + } + + unlock: + spin_unlock(&ctx->lock); +} + +/* + * Enable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_counter_for_each_child or perf_counter_for_each as described + * for perf_counter_disable. + */ +static void perf_counter_enable(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the counter on the cpu that it's on + */ + smp_call_function_single(counter->cpu, __perf_counter_enable, + counter, 1); + return; + } + + spin_lock_irq(&ctx->lock); + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + goto out; + + /* + * If the counter is in error state, clear that first. + * That way, if we see the counter in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. + */ + if (counter->state == PERF_COUNTER_STATE_ERROR) + counter->state = PERF_COUNTER_STATE_OFF; + + retry: + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_counter_enable, counter); + + spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the counter is still off, + * we need to retry the cross-call. + */ + if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) + goto retry; + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (counter->state == PERF_COUNTER_STATE_OFF) { + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; + } + out: + spin_unlock_irq(&ctx->lock); +} + +static int perf_counter_refresh(struct perf_counter *counter, int refresh) +{ + /* + * not supported on inherited counters + */ + if (counter->attr.inherit) + return -EINVAL; + + atomic_add(refresh, &counter->event_limit); + perf_counter_enable(counter); + + return 0; +} + +void __perf_counter_sched_out(struct perf_counter_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_counter *counter; + + spin_lock(&ctx->lock); + ctx->is_active = 0; + if (likely(!ctx->nr_counters)) + goto out; + update_context_time(ctx); + + perf_disable(); + if (ctx->nr_active) { + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter != counter->group_leader) + counter_sched_out(counter, cpuctx, ctx); + else + group_sched_out(counter, cpuctx, ctx); + } + } + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled counters. + * If the number of enabled counters is the same, then the set + * of enabled counters should be the same, because these are both + * inherited contexts, therefore we can't access individual counters + * in them directly with an fd; we can only enable/disable all + * counters via prctl, or enable/disable all counters in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_counter_context *ctx1, + struct perf_counter_context *ctx2) +{ + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && !ctx1->pin_count && !ctx2->pin_count; +} + +/* + * Called from scheduler to remove the counters of the current task, + * with interrupts disabled. + * + * We stop each counter and update the counter value in counter->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * not restart the counter. + */ +void perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = task->perf_counter_ctxp; + struct perf_counter_context *next_ctx; + struct perf_counter_context *parent; + struct pt_regs *regs; + int do_switch = 1; + + regs = task_pt_regs(task); + perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + + if (likely(!ctx || !cpuctx->task_ctx)) + return; + + update_context_time(ctx); + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); + next_ctx = next->perf_counter_ctxp; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. + */ + spin_lock(&ctx->lock); + spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + /* + * XXX do we need a memory barrier of sorts + * wrt to rcu_dereference() of perf_counter_ctxp + */ + task->perf_counter_ctxp = next_ctx; + next->perf_counter_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + } + spin_unlock(&next_ctx->lock); + spin_unlock(&ctx->lock); + } + rcu_read_unlock(); + + if (do_switch) { + __perf_counter_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; + } +} + +/* + * Called with IRQs disabled + */ +static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + __perf_counter_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; +} + +/* + * Called with IRQs disabled + */ +static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) +{ + __perf_counter_sched_out(&cpuctx->ctx, cpuctx); +} + +static void +__perf_counter_sched_in(struct perf_counter_context *ctx, + struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_counter *counter; + int can_add_hw = 1; + + spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_counters)) + goto out; + + ctx->timestamp = perf_clock(); + + perf_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state <= PERF_COUNTER_STATE_OFF || + !counter->attr.pinned) + continue; + if (counter->cpu != -1 && counter->cpu != cpu) + continue; + + if (counter != counter->group_leader) + counter_sched_in(counter, cpuctx, ctx, cpu); + else { + if (group_can_go_on(counter, cpuctx, 1)) + group_sched_in(counter, cpuctx, ctx, cpu); + } + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_group_times(counter); + counter->state = PERF_COUNTER_STATE_ERROR; + } + } + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + /* + * Ignore counters in OFF or ERROR state, and + * ignore pinned counters since we did them already. + */ + if (counter->state <= PERF_COUNTER_STATE_OFF || + counter->attr.pinned) + continue; + + /* + * Listen to the 'cpu' scheduling filter constraint + * of counters: + */ + if (counter->cpu != -1 && counter->cpu != cpu) + continue; + + if (counter != counter->group_leader) { + if (counter_sched_in(counter, cpuctx, ctx, cpu)) + can_add_hw = 0; + } else { + if (group_can_go_on(counter, cpuctx, can_add_hw)) { + if (group_sched_in(counter, cpuctx, ctx, cpu)) + can_add_hw = 0; + } + } + } + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Called from scheduler to add the counters of the current task + * with interrupts disabled. + * + * We restore the counter value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * keep the counter running. + */ +void perf_counter_task_sched_in(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = task->perf_counter_ctxp; + + if (likely(!ctx)) + return; + if (cpuctx->task_ctx == ctx) + return; + __perf_counter_sched_in(ctx, cpuctx, cpu); + cpuctx->task_ctx = ctx; +} + +static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_counter_context *ctx = &cpuctx->ctx; + + __perf_counter_sched_in(ctx, cpuctx, cpu); +} + +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_counter *counter, int enable); +static void perf_log_period(struct perf_counter *counter, u64 period); + +static void perf_adjust_period(struct perf_counter *counter, u64 events) +{ + struct hw_perf_counter *hwc = &counter->hw; + u64 period, sample_period; + s64 delta; + + events *= hwc->sample_period; + period = div64_u64(events, counter->attr.sample_freq); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + perf_log_period(counter, sample_period); + + hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + struct hw_perf_counter *hwc; + u64 interrupts, freq; + + spin_lock(&ctx->lock); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + continue; + + hwc = &counter->hw; + + interrupts = hwc->interrupts; + hwc->interrupts = 0; + + /* + * unthrottle counters on the tick + */ + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(counter, 1); + counter->pmu->unthrottle(counter); + interrupts = 2*sysctl_perf_counter_sample_rate/HZ; + } + + if (!counter->attr.freq || !counter->attr.sample_freq) + continue; + + /* + * if the specified freq < HZ then we need to skip ticks + */ + if (counter->attr.sample_freq < HZ) { + freq = counter->attr.sample_freq; + + hwc->freq_count += freq; + hwc->freq_interrupts += interrupts; + + if (hwc->freq_count < HZ) + continue; + + interrupts = hwc->freq_interrupts; + hwc->freq_interrupts = 0; + hwc->freq_count -= HZ; + } else + freq = HZ; + + perf_adjust_period(counter, freq * interrupts); + + /* + * In order to avoid being stalled by an (accidental) huge + * sample period, force reset the sample period if we didn't + * get any events in this freq period. + */ + if (!interrupts) { + perf_disable(); + counter->pmu->disable(counter); + atomic_set(&hwc->period_left, 0); + counter->pmu->enable(counter); + perf_enable(); + } + } + spin_unlock(&ctx->lock); +} + +/* + * Round-robin a context's counters: + */ +static void rotate_ctx(struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + + if (!ctx->nr_counters) + return; + + spin_lock(&ctx->lock); + /* + * Rotate the first entry last (works just fine for group counters too): + */ + perf_disable(); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + list_move_tail(&counter->list_entry, &ctx->counter_list); + break; + } + perf_enable(); + + spin_unlock(&ctx->lock); +} + +void perf_counter_task_tick(struct task_struct *curr, int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + + if (!atomic_read(&nr_counters)) + return; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = curr->perf_counter_ctxp; + + perf_ctx_adjust_freq(&cpuctx->ctx); + if (ctx) + perf_ctx_adjust_freq(ctx); + + perf_counter_cpu_sched_out(cpuctx); + if (ctx) + __perf_counter_task_sched_out(ctx); + + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); + + perf_counter_cpu_sched_in(cpuctx, cpu); + if (ctx) + perf_counter_task_sched_in(curr, cpu); +} + +/* + * Cross CPU call to read the hardware counter + */ +static void __read(void *info) +{ + struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; + unsigned long flags; + + local_irq_save(flags); + if (ctx->is_active) + update_context_time(ctx); + counter->pmu->read(counter); + update_counter_times(counter); + local_irq_restore(flags); +} + +static u64 perf_counter_read(struct perf_counter *counter) +{ + /* + * If counter is enabled and currently active on a CPU, update the + * value in the counter structure: + */ + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { + smp_call_function_single(counter->oncpu, + __read, counter, 1); + } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); + } + + return atomic64_read(&counter->count); +} + +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + +static struct perf_counter_context *find_get_context(pid_t pid, int cpu) +{ + struct perf_counter_context *parent_ctx; + struct perf_counter_context *ctx; + struct perf_cpu_context *cpuctx; + struct task_struct *task; + unsigned long flags; + int err; + + /* + * If cpu is not a wildcard then this is a percpu counter: + */ + if (cpu != -1) { + /* Must be root to operate on a CPU counter: */ + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + if (cpu < 0 || cpu > num_possible_cpus()) + return ERR_PTR(-EINVAL); + + /* + * We could be clever and allow to attach a counter to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_isset(cpu, cpu_online_map)) + return ERR_PTR(-ENODEV); + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + get_ctx(ctx); + + return ctx; + } + + rcu_read_lock(); + if (!pid) + task = current; + else + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + if (!task) + return ERR_PTR(-ESRCH); + + /* + * Can't attach counters to a dying task. + */ + err = -ESRCH; + if (task->flags & PF_EXITING) + goto errout; + + /* Reuse ptrace permission checks for now. */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + retry: + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + parent_ctx = ctx->parent_ctx; + if (parent_ctx) { + put_ctx(parent_ctx); + ctx->parent_ctx = NULL; /* no longer a clone */ + } + /* + * Get an extra reference before dropping the lock so that + * this context won't get freed if the task exits. + */ + get_ctx(ctx); + spin_unlock_irqrestore(&ctx->lock, flags); + } + + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + err = -ENOMEM; + if (!ctx) + goto errout; + __perf_counter_init_context(ctx, task); + get_ctx(ctx); + if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { + /* + * We raced with some other task; use + * the context they set. + */ + kfree(ctx); + goto retry; + } + get_task_struct(task); + } + + put_task_struct(task); + return ctx; + + errout: + put_task_struct(task); + return ERR_PTR(err); +} + +static void free_counter_rcu(struct rcu_head *head) +{ + struct perf_counter *counter; + + counter = container_of(head, struct perf_counter, rcu_head); + if (counter->ns) + put_pid_ns(counter->ns); + kfree(counter); +} + +static void perf_pending_sync(struct perf_counter *counter); + +static void free_counter(struct perf_counter *counter) +{ + perf_pending_sync(counter); + + atomic_dec(&nr_counters); + if (counter->attr.mmap) + atomic_dec(&nr_mmap_counters); + if (counter->attr.comm) + atomic_dec(&nr_comm_counters); + + if (counter->destroy) + counter->destroy(counter); + + put_ctx(counter->ctx); + call_rcu(&counter->rcu_head, free_counter_rcu); +} + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_counter *counter = file->private_data; + struct perf_counter_context *ctx = counter->ctx; + + file->private_data = NULL; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_counter_remove_from_context(counter); + mutex_unlock(&ctx->mutex); + + mutex_lock(&counter->owner->perf_counter_mutex); + list_del_init(&counter->owner_entry); + mutex_unlock(&counter->owner->perf_counter_mutex); + put_task_struct(counter->owner); + + free_counter(counter); + + return 0; +} + +/* + * Read the performance counter - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) +{ + u64 values[3]; + int n; + + /* + * Return end-of-file for a read on a counter that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (counter->state == PERF_COUNTER_STATE_ERROR) + return 0; + + WARN_ON_ONCE(counter->ctx->parent_ctx); + mutex_lock(&counter->child_mutex); + values[0] = perf_counter_read(counter); + n = 1; + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + if (counter->attr.read_format & PERF_FORMAT_ID) + values[n++] = counter->id; + mutex_unlock(&counter->child_mutex); + + if (count < n * sizeof(u64)) + return -EINVAL; + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct perf_counter *counter = file->private_data; + + return perf_read_hw(counter, buf, count); +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ + struct perf_counter *counter = file->private_data; + struct perf_mmap_data *data; + unsigned int events = POLL_HUP; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) + events = atomic_xchg(&data->poll, 0); + rcu_read_unlock(); + + poll_wait(file, &counter->waitq, wait); + + return events; +} + +static void perf_counter_reset(struct perf_counter *counter) +{ + (void)perf_counter_read(counter); + atomic64_set(&counter->count, 0); + perf_counter_update_userpage(counter); +} + +static void perf_counter_for_each_sibling(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *sibling; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + counter = counter->group_leader; + + func(counter); + list_for_each_entry(sibling, &counter->sibling_list, list_entry) + func(sibling); + mutex_unlock(&ctx->mutex); +} + +/* + * Holding the top-level counter's child_mutex means that any + * descendant process that has inherited this counter will block + * in sync_child_counter if it goes to exit, thus satisfying the + * task existence requirements of perf_counter_enable/disable. + */ +static void perf_counter_for_each_child(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter *child; + + WARN_ON_ONCE(counter->ctx->parent_ctx); + mutex_lock(&counter->child_mutex); + func(counter); + list_for_each_entry(child, &counter->child_list, child_list) + func(child); + mutex_unlock(&counter->child_mutex); +} + +static void perf_counter_for_each(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter *child; + + WARN_ON_ONCE(counter->ctx->parent_ctx); + mutex_lock(&counter->child_mutex); + perf_counter_for_each_sibling(counter, func); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_for_each_sibling(child, func); + mutex_unlock(&counter->child_mutex); +} + +static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) +{ + struct perf_counter_context *ctx = counter->ctx; + unsigned long size; + int ret = 0; + u64 value; + + if (!counter->attr.sample_period) + return -EINVAL; + + size = copy_from_user(&value, arg, sizeof(value)); + if (size != sizeof(value)) + return -EFAULT; + + if (!value) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + if (counter->attr.freq) { + if (value > sysctl_perf_counter_sample_rate) { + ret = -EINVAL; + goto unlock; + } + + counter->attr.sample_freq = value; + } else { + perf_log_period(counter, value); + + counter->attr.sample_period = value; + counter->hw.sample_period = value; + } +unlock: + spin_unlock_irq(&ctx->lock); + + return ret; +} + +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_counter *counter = file->private_data; + void (*func)(struct perf_counter *); + u32 flags = arg; + + switch (cmd) { + case PERF_COUNTER_IOC_ENABLE: + func = perf_counter_enable; + break; + case PERF_COUNTER_IOC_DISABLE: + func = perf_counter_disable; + break; + case PERF_COUNTER_IOC_RESET: + func = perf_counter_reset; + break; + + case PERF_COUNTER_IOC_REFRESH: + return perf_counter_refresh(counter, arg); + + case PERF_COUNTER_IOC_PERIOD: + return perf_counter_period(counter, (u64 __user *)arg); + + default: + return -ENOTTY; + } + + if (flags & PERF_IOC_FLAG_GROUP) + perf_counter_for_each(counter, func); + else + perf_counter_for_each_child(counter, func); + + return 0; +} + +int perf_counter_task_enable(void) +{ + struct perf_counter *counter; + + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_for_each_child(counter, perf_counter_enable); + mutex_unlock(¤t->perf_counter_mutex); + + return 0; +} + +int perf_counter_task_disable(void) +{ + struct perf_counter *counter; + + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_for_each_child(counter, perf_counter_disable); + mutex_unlock(¤t->perf_counter_mutex); + + return 0; +} + +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. + */ +void perf_counter_update_userpage(struct perf_counter *counter) +{ + struct perf_counter_mmap_page *userpg; + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto unlock; + + userpg = data->user_page; + + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); + ++userpg->lock; + barrier(); + userpg->index = counter->hw.idx; + userpg->offset = atomic64_read(&counter->count); + if (counter->state == PERF_COUNTER_STATE_ACTIVE) + userpg->offset -= atomic64_read(&counter->hw.prev_count); + + barrier(); + ++userpg->lock; + preempt_enable(); +unlock: + rcu_read_unlock(); +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_counter *counter = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto unlock; + + if (vmf->pgoff == 0) { + vmf->page = virt_to_page(data->user_page); + } else { + int nr = vmf->pgoff - 1; + + if ((unsigned)nr > data->nr_pages) + goto unlock; + + vmf->page = virt_to_page(data->data_pages[nr]); + } + get_page(vmf->page); + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + int i; + + WARN_ON(atomic_read(&counter->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += nr_pages * sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->data_pages[i]) + goto fail_data_pages; + } + + data->nr_pages = nr_pages; + atomic_set(&data->lock, -1); + + rcu_assign_pointer(counter->data, data); + + return 0; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)data->data_pages[i]); + + free_page((unsigned long)data->user_page); + +fail_user_page: + kfree(data); + +fail: + return -ENOMEM; +} + +static void __perf_mmap_data_free(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data; + int i; + + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + + free_page((unsigned long)data->user_page); + for (i = 0; i < data->nr_pages; i++) + free_page((unsigned long)data->data_pages[i]); + kfree(data); +} + +static void perf_mmap_data_free(struct perf_counter *counter) +{ + struct perf_mmap_data *data = counter->data; + + WARN_ON(atomic_read(&counter->mmap_count)); + + rcu_assign_pointer(counter->data, NULL); + call_rcu(&data->rcu_head, __perf_mmap_data_free); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + atomic_inc(&counter->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + WARN_ON_ONCE(counter->ctx->parent_ctx); + if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { + struct user_struct *user = current_user(); + + atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); + vma->vm_mm->locked_vm -= counter->data->nr_locked; + perf_mmap_data_free(counter); + mutex_unlock(&counter->mmap_mutex); + } +} + +static struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_counter *counter = file->private_data; + unsigned long user_locked, user_lock_limit; + struct user_struct *user = current_user(); + unsigned long locked, lock_limit; + unsigned long vma_size; + unsigned long nr_pages; + long user_extra, extra; + int ret = 0; + + if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) + return -EINVAL; + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + /* + * If we have data pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + if (vma_size != PAGE_SIZE * (1 + nr_pages)) + return -EINVAL; + + if (vma->vm_pgoff != 0) + return -EINVAL; + + WARN_ON_ONCE(counter->ctx->parent_ctx); + mutex_lock(&counter->mmap_mutex); + if (atomic_inc_not_zero(&counter->mmap_count)) { + if (nr_pages != counter->data->nr_pages) + ret = -EINVAL; + goto unlock; + } + + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + + /* + * Increase the limit linearly with more CPUs: + */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm) + user_extra; + + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; + + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } + + WARN_ON(counter->data); + ret = perf_mmap_data_alloc(counter, nr_pages); + if (ret) + goto unlock; + + atomic_set(&counter->mmap_count, 1); + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->locked_vm += extra; + counter->data->nr_locked = extra; +unlock: + mutex_unlock(&counter->mmap_mutex); + + vma->vm_flags &= ~VM_MAYWRITE; + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + + return ret; +} + +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_counter *counter = filp->private_data; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &counter->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + +static const struct file_operations perf_fops = { + .release = perf_release, + .read = perf_read, + .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, + .fasync = perf_fasync, +}; + +/* + * Perf counter wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_counter_wakeup(struct perf_counter *counter) +{ + wake_up_all(&counter->waitq); + + if (counter->pending_kill) { + kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); + counter->pending_kill = 0; + } +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. + */ + +static void perf_pending_counter(struct perf_pending_entry *entry) +{ + struct perf_counter *counter = container_of(entry, + struct perf_counter, pending); + + if (counter->pending_disable) { + counter->pending_disable = 0; + perf_counter_disable(counter); + } + + if (counter->pending_wakeup) { + counter->pending_wakeup = 0; + perf_counter_wakeup(counter); + } +} + +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) +{ + struct perf_pending_entry **head; + + if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) + return; + + entry->func = func; + + head = &get_cpu_var(perf_pending_head); + + do { + entry->next = *head; + } while (cmpxchg(head, entry->next, entry) != entry->next); + + set_perf_counter_pending(); + + put_cpu_var(perf_pending_head); +} + +static int __perf_pending_run(void) +{ + struct perf_pending_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + void (*func)(struct perf_pending_entry *); + struct perf_pending_entry *entry = list; + + list = list->next; + + func = entry->func; + entry->next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + func(entry); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_counter *counter) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. -- see perf_pending_handle() + */ + smp_rmb(); + return counter->pending.next == NULL; +} + +static void perf_pending_sync(struct perf_counter *counter) +{ + wait_event(counter->waitq, perf_not_pending(counter)); +} + +void perf_counter_do_pending(void) +{ + __perf_pending_run(); +} + +/* + * Callchain support -- arch specific + */ + +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + return NULL; +} + +/* + * Output + */ + +struct perf_output_handle { + struct perf_counter *counter; + struct perf_mmap_data *data; + unsigned long head; + unsigned long offset; + int nmi; + int overflow; + int locked; + unsigned long flags; +}; + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->data->poll, POLL_IN); + + if (handle->nmi) { + handle->counter->pending_wakeup = 1; + perf_pending_queue(&handle->counter->pending, + perf_pending_counter); + } else + perf_counter_wakeup(handle->counter); +} + +/* + * Curious locking construct. + * + * We need to ensure a later event doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. + */ +static void perf_output_lock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int cpu; + + handle->locked = 0; + + local_irq_save(handle->flags); + cpu = smp_processor_id(); + + if (in_nmi() && atomic_read(&data->lock) == cpu) + return; + + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + unsigned long head; + int cpu; + + data->done_head = data->head; + + if (!handle->locked) + goto out; + +again: + /* + * The xchg implies a full barrier that ensures all writes are done + * before we publish the new head, matched by a rmb() in userspace when + * reading this position. + */ + while ((head = atomic_long_xchg(&data->done_head, 0))) + data->user_page->data_head = head; + + /* + * NMI can happen here, which means we can miss a done_head update. + */ + + cpu = atomic_xchg(&data->lock, -1); + WARN_ON_ONCE(cpu != smp_processor_id()); + + /* + * Therefore we have to validate we did not indeed do so. + */ + if (unlikely(atomic_long_read(&data->done_head))) { + /* + * Since we had it locked, we can lock it again. + */ + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + goto again; + } + + if (atomic_xchg(&data->wakeup, 0)) + perf_output_wakeup(handle); +out: + local_irq_restore(handle->flags); +} + +static int perf_output_begin(struct perf_output_handle *handle, + struct perf_counter *counter, unsigned int size, + int nmi, int overflow) +{ + struct perf_mmap_data *data; + unsigned int offset, head; + + /* + * For inherited counters we send all the output towards the parent. + */ + if (counter->parent) + counter = counter->parent; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto out; + + handle->data = data; + handle->counter = counter; + handle->nmi = nmi; + handle->overflow = overflow; + + if (!data->nr_pages) + goto fail; + + perf_output_lock(handle); + + do { + offset = head = atomic_long_read(&data->head); + head += size; + } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); + + handle->offset = offset; + handle->head = head; + + if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) + atomic_set(&data->wakeup, 1); + + return 0; + +fail: + perf_output_wakeup(handle); +out: + rcu_read_unlock(); + + return -ENOSPC; +} + +static void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned int offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned int page_offset; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_offset = offset & (PAGE_SIZE - 1); + size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; + + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); +} + +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + +static void perf_output_end(struct perf_output_handle *handle) +{ + struct perf_counter *counter = handle->counter; + struct perf_mmap_data *data = handle->data; + + int wakeup_events = counter->attr.wakeup_events; + + if (handle->overflow && wakeup_events) { + int events = atomic_inc_return(&data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &data->events); + atomic_set(&data->wakeup, 1); + } + } + + perf_output_unlock(handle); + rcu_read_unlock(); +} + +static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) +{ + /* + * only top level counters have the pid namespace they were created in + */ + if (counter->parent) + counter = counter->parent; + + return task_tgid_nr_ns(p, counter->ns); +} + +static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) +{ + /* + * only top level counters have the pid namespace they were created in + */ + if (counter->parent) + counter = counter->parent; + + return task_pid_nr_ns(p, counter->ns); +} + +static void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) +{ + int ret; + u64 sample_type = counter->attr.sample_type; + struct perf_output_handle handle; + struct perf_event_header header; + u64 ip; + struct { + u32 pid, tid; + } tid_entry; + struct { + u64 id; + u64 counter; + } group_entry; + struct perf_callchain_entry *callchain = NULL; + int callchain_size = 0; + u64 time; + struct { + u32 cpu, reserved; + } cpu_entry; + + header.type = 0; + header.size = sizeof(header); + + header.misc = PERF_EVENT_MISC_OVERFLOW; + header.misc |= perf_misc_flags(data->regs); + + if (sample_type & PERF_SAMPLE_IP) { + ip = perf_instruction_pointer(data->regs); + header.type |= PERF_SAMPLE_IP; + header.size += sizeof(ip); + } + + if (sample_type & PERF_SAMPLE_TID) { + /* namespace issues */ + tid_entry.pid = perf_counter_pid(counter, current); + tid_entry.tid = perf_counter_tid(counter, current); + + header.type |= PERF_SAMPLE_TID; + header.size += sizeof(tid_entry); + } + + if (sample_type & PERF_SAMPLE_TIME) { + /* + * Maybe do better on x86 and provide cpu_clock_nmi() + */ + time = sched_clock(); + + header.type |= PERF_SAMPLE_TIME; + header.size += sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_ADDR) { + header.type |= PERF_SAMPLE_ADDR; + header.size += sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_ID) { + header.type |= PERF_SAMPLE_ID; + header.size += sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_CPU) { + header.type |= PERF_SAMPLE_CPU; + header.size += sizeof(cpu_entry); + + cpu_entry.cpu = raw_smp_processor_id(); + } + + if (sample_type & PERF_SAMPLE_PERIOD) { + header.type |= PERF_SAMPLE_PERIOD; + header.size += sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_GROUP) { + header.type |= PERF_SAMPLE_GROUP; + header.size += sizeof(u64) + + counter->nr_siblings * sizeof(group_entry); + } + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + callchain = perf_callchain(data->regs); + + if (callchain) { + callchain_size = (1 + callchain->nr) * sizeof(u64); + + header.type |= PERF_SAMPLE_CALLCHAIN; + header.size += callchain_size; + } + } + + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); + if (ret) + return; + + perf_output_put(&handle, header); + + if (sample_type & PERF_SAMPLE_IP) + perf_output_put(&handle, ip); + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(&handle, tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(&handle, time); + + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(&handle, data->addr); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(&handle, counter->id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(&handle, cpu_entry); + + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(&handle, data->period); + + /* + * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. + */ + if (sample_type & PERF_SAMPLE_GROUP) { + struct perf_counter *leader, *sub; + u64 nr = counter->nr_siblings; + + perf_output_put(&handle, nr); + + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->pmu->read(sub); + + group_entry.id = sub->id; + group_entry.counter = atomic64_read(&sub->count); + + perf_output_put(&handle, group_entry); + } + } + + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); + + perf_output_end(&handle); +} + +/* + * fork tracking + */ + +struct perf_fork_event { + struct task_struct *task; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + } event; +}; + +static void perf_counter_fork_output(struct perf_counter *counter, + struct perf_fork_event *fork_event) +{ + struct perf_output_handle handle; + int size = fork_event->event.header.size; + struct task_struct *task = fork_event->task; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + fork_event->event.pid = perf_counter_pid(counter, task); + fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); + + perf_output_put(&handle, fork_event->event); + perf_output_end(&handle); +} + +static int perf_counter_fork_match(struct perf_counter *counter) +{ + if (counter->attr.comm || counter->attr.mmap) + return 1; + + return 0; +} + +static void perf_counter_fork_ctx(struct perf_counter_context *ctx, + struct perf_fork_event *fork_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_fork_match(counter)) + perf_counter_fork_output(counter, fork_event); + } + rcu_read_unlock(); +} + +static void perf_counter_fork_event(struct perf_fork_event *fork_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_fork_ctx(&cpuctx->ctx, fork_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_counter_fork_ctx(ctx, fork_event); + rcu_read_unlock(); +} + +void perf_counter_fork(struct task_struct *task) +{ + struct perf_fork_event fork_event; + + if (!atomic_read(&nr_comm_counters) && + !atomic_read(&nr_mmap_counters)) + return; + + fork_event = (struct perf_fork_event){ + .task = task, + .event = { + .header = { + .type = PERF_EVENT_FORK, + .size = sizeof(fork_event.event), + }, + }, + }; + + perf_counter_fork_event(&fork_event); +} + +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event; +}; + +static void perf_counter_comm_output(struct perf_counter *counter, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + int size = comm_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + comm_event->event.pid = perf_counter_pid(counter, comm_event->task); + comm_event->event.tid = perf_counter_tid(counter, comm_event->task); + + perf_output_put(&handle, comm_event->event); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + perf_output_end(&handle); +} + +static int perf_counter_comm_match(struct perf_counter *counter) +{ + if (counter->attr.comm) + return 1; + + return 0; +} + +static void perf_counter_comm_ctx(struct perf_counter_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_comm_match(counter)) + perf_counter_comm_output(counter, comm_event); + } + rcu_read_unlock(); +} + +static void perf_counter_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + unsigned int size; + char *comm = comm_event->task->comm; + + size = ALIGN(strlen(comm)+1, sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event.header.size = sizeof(comm_event->event) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_comm_ctx(&cpuctx->ctx, comm_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_counter_comm_ctx(ctx, comm_event); + rcu_read_unlock(); +} + +void perf_counter_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event; + + if (!atomic_read(&nr_comm_counters)) + return; + + comm_event = (struct perf_comm_event){ + .task = task, + .event = { + .header = { .type = PERF_EVENT_COMM, }, + }, + }; + + perf_counter_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct vm_area_struct *vma; + + const char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event; +}; + +static void perf_counter_mmap_output(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + int size = mmap_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + mmap_event->event.pid = perf_counter_pid(counter, current); + mmap_event->event.tid = perf_counter_tid(counter, current); + + perf_output_put(&handle, mmap_event->event); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + perf_output_end(&handle); +} + +static int perf_counter_mmap_match(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + if (counter->attr.mmap) + return 1; + + return 0; +} + +static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, + struct perf_mmap_event *mmap_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_mmap_match(counter, mmap_event)) + perf_counter_mmap_output(counter, mmap_event); + } + rcu_read_unlock(); +} + +static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + const char *name; + + if (file) { + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = d_path(&file->f_path, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + name = arch_vma_name(mmap_event->vma); + if (name) + goto got_name; + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } + + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name)+1, sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event.header.size = sizeof(mmap_event->event) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_counter_mmap_ctx(ctx, mmap_event); + rcu_read_unlock(); + + kfree(buf); +} + +void __perf_counter_mmap(struct vm_area_struct *vma) +{ + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_counters)) + return; + + mmap_event = (struct perf_mmap_event){ + .vma = vma, + .event = { + .header = { .type = PERF_EVENT_MMAP, }, + .start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = vma->vm_pgoff, + }, + }; + + perf_counter_mmap_event(&mmap_event); +} + +/* + * Log sample_period changes so that analyzing tools can re-normalize the + * event flow. + */ + +struct freq_event { + struct perf_event_header header; + u64 time; + u64 id; + u64 period; +}; + +static void perf_log_period(struct perf_counter *counter, u64 period) +{ + struct perf_output_handle handle; + struct freq_event event; + int ret; + + if (counter->hw.sample_period == period) + return; + + if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) + return; + + event = (struct freq_event) { + .header = { + .type = PERF_EVENT_PERIOD, + .misc = 0, + .size = sizeof(event), + }, + .time = sched_clock(), + .id = counter->id, + .period = period, + }; + + ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); + if (ret) + return; + + perf_output_put(&handle, event); + perf_output_end(&handle); +} + +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_counter *counter, int enable) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 id; + } throttle_event = { + .header = { + .type = PERF_EVENT_THROTTLE + 1, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = sched_clock(), + .id = counter->id, + }; + + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_output_end(&handle); +} + +/* + * Generic counter overflow handling. + */ + +int perf_counter_overflow(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) +{ + int events = atomic_read(&counter->event_limit); + int throttle = counter->pmu->unthrottle != NULL; + struct hw_perf_counter *hwc = &counter->hw; + int ret = 0; + + if (!throttle) { + hwc->interrupts++; + } else { + if (hwc->interrupts != MAX_INTERRUPTS) { + hwc->interrupts++; + if (HZ * hwc->interrupts > + (u64)sysctl_perf_counter_sample_rate) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(counter, 0); + ret = 1; + } + } else { + /* + * Keep re-disabling counters even though on the previous + * pass we disabled it - just in case we raced with a + * sched-in and the counter got enabled again: + */ + ret = 1; + } + } + + if (counter->attr.freq) { + u64 now = sched_clock(); + s64 delta = now - hwc->freq_stamp; + + hwc->freq_stamp = now; + + if (delta > 0 && delta < TICK_NSEC) + perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); + } + + /* + * XXX event_limit might not quite work as expected on inherited + * counters + */ + + counter->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&counter->event_limit)) { + ret = 1; + counter->pending_kill = POLL_HUP; + if (nmi) { + counter->pending_disable = 1; + perf_pending_queue(&counter->pending, + perf_pending_counter); + } else + perf_counter_disable(counter); + } + + perf_counter_output(counter, nmi, data); + return ret; +} + +/* + * Generic software counter infrastructure + */ + +static void perf_swcounter_update(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + u64 prev, now; + s64 delta; + +again: + prev = atomic64_read(&hwc->prev_count); + now = atomic64_read(&hwc->count); + if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) + goto again; + + delta = now - prev; + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); +} + +static void perf_swcounter_set_period(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + s64 left = atomic64_read(&hwc->period_left); + s64 period = hwc->sample_period; + + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_add(period, &hwc->period_left); + hwc->last_period = period; + } + + atomic64_set(&hwc->prev_count, -left); + atomic64_set(&hwc->count, -left); +} + +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct perf_counter *counter; + u64 period; + + counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); + counter->pmu->read(counter); + + data.addr = 0; + data.regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. + */ + if ((counter->attr.exclude_kernel || !data.regs) && + !counter->attr.exclude_user) + data.regs = task_pt_regs(current); + + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, counter->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + +static void perf_swcounter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs, u64 addr) +{ + struct perf_sample_data data = { + .regs = regs, + .addr = addr, + .period = counter->hw.last_period, + }; + + perf_swcounter_update(counter); + perf_swcounter_set_period(counter); + if (perf_counter_overflow(counter, nmi, &data)) + /* soft-disable the counter */ + ; + +} + +static int perf_swcounter_is_counting(struct perf_counter *counter) +{ + struct perf_counter_context *ctx; + unsigned long flags; + int count; + + if (counter->state == PERF_COUNTER_STATE_ACTIVE) + return 1; + + if (counter->state != PERF_COUNTER_STATE_INACTIVE) + return 0; + + /* + * If the counter is inactive, it could be just because + * its task is scheduled out, or because it's in a group + * which could not go on the PMU. We want to count in + * the first case but not the second. If the context is + * currently active then an inactive software counter must + * be the second case. If it's not currently active then + * we need to know whether the counter was active when the + * context was last active, which we can determine by + * comparing counter->tstamp_stopped with ctx->time. + * + * We are within an RCU read-side critical section, + * which protects the existence of *ctx. + */ + ctx = counter->ctx; + spin_lock_irqsave(&ctx->lock, flags); + count = 1; + /* Re-check state now we have the lock */ + if (counter->state < PERF_COUNTER_STATE_INACTIVE || + counter->ctx->is_active || + counter->tstamp_stopped < ctx->time) + count = 0; + spin_unlock_irqrestore(&ctx->lock, flags); + return count; +} + +static int perf_swcounter_match(struct perf_counter *counter, + enum perf_type_id type, + u32 event, struct pt_regs *regs) +{ + if (!perf_swcounter_is_counting(counter)) + return 0; + + if (counter->attr.type != type) + return 0; + if (counter->attr.config != event) + return 0; + + if (regs) { + if (counter->attr.exclude_user && user_mode(regs)) + return 0; + + if (counter->attr.exclude_kernel && !user_mode(regs)) + return 0; + } + + return 1; +} + +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, + int nmi, struct pt_regs *regs, u64 addr) +{ + int neg = atomic64_add_negative(nr, &counter->hw.count); + + if (counter->hw.sample_period && !neg && regs) + perf_swcounter_overflow(counter, nmi, regs, addr); +} + +static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, + enum perf_type_id type, u32 event, + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_swcounter_match(counter, type, event, regs)) + perf_swcounter_add(counter, nr, nmi, regs, addr); + } + rcu_read_unlock(); +} + +static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) +{ + if (in_nmi()) + return &cpuctx->recursion[3]; + + if (in_irq()) + return &cpuctx->recursion[2]; + + if (in_softirq()) + return &cpuctx->recursion[1]; + + return &cpuctx->recursion[0]; +} + +static void __perf_swcounter_event(enum perf_type_id type, u32 event, + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) +{ + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int *recursion = perf_swcounter_recursion_context(cpuctx); + struct perf_counter_context *ctx; + + if (*recursion) + goto out; + + (*recursion)++; + barrier(); + + perf_swcounter_ctx_event(&cpuctx->ctx, type, event, + nr, nmi, regs, addr); + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr); + rcu_read_unlock(); + + barrier(); + (*recursion)--; + +out: + put_cpu_var(perf_cpu_context); +} + +void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +{ + __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); +} + +static void perf_swcounter_read(struct perf_counter *counter) +{ + perf_swcounter_update(counter); +} + +static int perf_swcounter_enable(struct perf_counter *counter) +{ + perf_swcounter_set_period(counter); + return 0; +} + +static void perf_swcounter_disable(struct perf_counter *counter) +{ + perf_swcounter_update(counter); +} + +static const struct pmu perf_ops_generic = { + .enable = perf_swcounter_enable, + .disable = perf_swcounter_disable, + .read = perf_swcounter_read, +}; + +/* + * Software counter: cpu wall time clock + */ + +static void cpu_clock_perf_counter_update(struct perf_counter *counter) +{ + int cpu = raw_smp_processor_id(); + s64 prev; + u64 now; + + now = cpu_clock(cpu); + prev = atomic64_read(&counter->hw.prev_count); + atomic64_set(&counter->hw.prev_count, now); + atomic64_add(now - prev, &counter->count); +} + +static int cpu_clock_perf_counter_enable(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + int cpu = raw_smp_processor_id(); + + atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void cpu_clock_perf_counter_disable(struct perf_counter *counter) +{ + if (counter->hw.sample_period) + hrtimer_cancel(&counter->hw.hrtimer); + cpu_clock_perf_counter_update(counter); +} + +static void cpu_clock_perf_counter_read(struct perf_counter *counter) +{ + cpu_clock_perf_counter_update(counter); +} + +static const struct pmu perf_ops_cpu_clock = { + .enable = cpu_clock_perf_counter_enable, + .disable = cpu_clock_perf_counter_disable, + .read = cpu_clock_perf_counter_read, +}; + +/* + * Software counter: task time clock + */ + +static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) +{ + u64 prev; + s64 delta; + + prev = atomic64_xchg(&counter->hw.prev_count, now); + delta = now - prev; + atomic64_add(delta, &counter->count); +} + +static int task_clock_perf_counter_enable(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + u64 now; + + now = counter->ctx->time; + + atomic64_set(&hwc->prev_count, now); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void task_clock_perf_counter_disable(struct perf_counter *counter) +{ + if (counter->hw.sample_period) + hrtimer_cancel(&counter->hw.hrtimer); + task_clock_perf_counter_update(counter, counter->ctx->time); + +} + +static void task_clock_perf_counter_read(struct perf_counter *counter) +{ + u64 time; + + if (!in_nmi()) { + update_context_time(counter->ctx); + time = counter->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - counter->ctx->timestamp; + time = counter->ctx->time + delta; + } + + task_clock_perf_counter_update(counter, time); +} + +static const struct pmu perf_ops_task_clock = { + .enable = task_clock_perf_counter_enable, + .disable = task_clock_perf_counter_disable, + .read = task_clock_perf_counter_read, +}; + +/* + * Software counter: cpu migrations + */ +void perf_counter_task_migration(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx; + + perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, + PERF_COUNT_SW_CPU_MIGRATIONS, + 1, 1, NULL, 0); + + ctx = perf_pin_task_context(task); + if (ctx) { + perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, + PERF_COUNT_SW_CPU_MIGRATIONS, + 1, 1, NULL, 0); + perf_unpin_context(ctx); + } +} + +#ifdef CONFIG_EVENT_PROFILE +void perf_tpcounter_event(int event_id) +{ + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); + + __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); +} +EXPORT_SYMBOL_GPL(perf_tpcounter_event); + +extern int ftrace_profile_enable(int); +extern void ftrace_profile_disable(int); + +static void tp_perf_counter_destroy(struct perf_counter *counter) +{ + ftrace_profile_disable(perf_event_id(&counter->attr)); +} + +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) +{ + int event_id = perf_event_id(&counter->attr); + int ret; + + ret = ftrace_profile_enable(event_id); + if (ret) + return NULL; + + counter->destroy = tp_perf_counter_destroy; + + return &perf_ops_generic; +} +#else +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) +{ + return NULL; +} +#endif + +static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) +{ + const struct pmu *pmu = NULL; + + /* + * Software counters (currently) can't in general distinguish + * between user, kernel and hypervisor events. + * However, context switches and cpu migrations are considered + * to be kernel events, and page faults are never hypervisor + * events. + */ + switch (counter->attr.config) { + case PERF_COUNT_SW_CPU_CLOCK: + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_TASK_CLOCK: + /* + * If the user instantiates this as a per-cpu counter, + * use the cpu_clock counter instead. + */ + if (counter->ctx->task) + pmu = &perf_ops_task_clock; + else + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_PAGE_FAULTS: + case PERF_COUNT_SW_PAGE_FAULTS_MIN: + case PERF_COUNT_SW_PAGE_FAULTS_MAJ: + case PERF_COUNT_SW_CONTEXT_SWITCHES: + case PERF_COUNT_SW_CPU_MIGRATIONS: + pmu = &perf_ops_generic; + break; + } + + return pmu; +} + +/* + * Allocate and initialize a counter structure + */ +static struct perf_counter * +perf_counter_alloc(struct perf_counter_attr *attr, + int cpu, + struct perf_counter_context *ctx, + struct perf_counter *group_leader, + gfp_t gfpflags) +{ + const struct pmu *pmu; + struct perf_counter *counter; + struct hw_perf_counter *hwc; + long err; + + counter = kzalloc(sizeof(*counter), gfpflags); + if (!counter) + return ERR_PTR(-ENOMEM); + + /* + * Single counters are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = counter; + + mutex_init(&counter->child_mutex); + INIT_LIST_HEAD(&counter->child_list); + + INIT_LIST_HEAD(&counter->list_entry); + INIT_LIST_HEAD(&counter->event_entry); + INIT_LIST_HEAD(&counter->sibling_list); + init_waitqueue_head(&counter->waitq); + + mutex_init(&counter->mmap_mutex); + + counter->cpu = cpu; + counter->attr = *attr; + counter->group_leader = group_leader; + counter->pmu = NULL; + counter->ctx = ctx; + counter->oncpu = -1; + + counter->ns = get_pid_ns(current->nsproxy->pid_ns); + counter->id = atomic64_inc_return(&perf_counter_id); + + counter->state = PERF_COUNTER_STATE_INACTIVE; + + if (attr->disabled) + counter->state = PERF_COUNTER_STATE_OFF; + + pmu = NULL; + + hwc = &counter->hw; + hwc->sample_period = attr->sample_period; + if (attr->freq && attr->sample_freq) + hwc->sample_period = 1; + + atomic64_set(&hwc->period_left, hwc->sample_period); + + /* + * we currently do not support PERF_SAMPLE_GROUP on inherited counters + */ + if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) + goto done; + + switch (attr->type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + pmu = hw_perf_counter_init(counter); + break; + + case PERF_TYPE_SOFTWARE: + pmu = sw_perf_counter_init(counter); + break; + + case PERF_TYPE_TRACEPOINT: + pmu = tp_perf_counter_init(counter); + break; + + default: + break; + } +done: + err = 0; + if (!pmu) + err = -EINVAL; + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); + + if (err) { + if (counter->ns) + put_pid_ns(counter->ns); + kfree(counter); + return ERR_PTR(err); + } + + counter->pmu = pmu; + + atomic_inc(&nr_counters); + if (counter->attr.mmap) + atomic_inc(&nr_mmap_counters); + if (counter->attr.comm) + atomic_inc(&nr_comm_counters); + + return counter; +} + +static int perf_copy_attr(struct perf_counter_attr __user *uattr, + struct perf_counter_attr *attr) +{ + int ret; + u32 size; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0. + */ + if (size > sizeof(*attr)) { + unsigned long val; + unsigned long __user *addr; + unsigned long __user *end; + + addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr), + sizeof(unsigned long)); + end = PTR_ALIGN((void __user *)uattr + size, + sizeof(unsigned long)); + + for (; addr < end; addr += sizeof(unsigned long)) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. + */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + +/** + * sys_perf_counter_open - open a performance counter, associate it to a task/cpu + * + * @attr_uptr: event type attributes for monitoring/sampling + * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader counter fd + */ +SYSCALL_DEFINE5(perf_counter_open, + struct perf_counter_attr __user *, attr_uptr, + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) +{ + struct perf_counter *counter, *group_leader; + struct perf_counter_attr attr; + struct perf_counter_context *ctx; + struct file *counter_file = NULL; + struct file *group_file = NULL; + int fput_needed = 0; + int fput_needed2 = 0; + int ret; + + /* for future expandability... */ + if (flags) + return -EINVAL; + + ret = perf_copy_attr(attr_uptr, &attr); + if (ret) + return ret; + + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_counter_sample_rate) + return -EINVAL; + } + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + /* + * Look up the group leader (we will attach this counter to it): + */ + group_leader = NULL; + if (group_fd != -1) { + ret = -EINVAL; + group_file = fget_light(group_fd, &fput_needed); + if (!group_file) + goto err_put_context; + if (group_file->f_op != &perf_fops) + goto err_put_context; + + group_leader = group_file->private_data; + /* + * Do not allow a recursive hierarchy (this new sibling + * becoming part of another group-sibling): + */ + if (group_leader->group_leader != group_leader) + goto err_put_context; + /* + * Do not allow to attach to a group in a different + * task or CPU context: + */ + if (group_leader->ctx != ctx) + goto err_put_context; + /* + * Only a group leader can be exclusive or pinned + */ + if (attr.exclusive || attr.pinned) + goto err_put_context; + } + + counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, + GFP_KERNEL); + ret = PTR_ERR(counter); + if (IS_ERR(counter)) + goto err_put_context; + + ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); + if (ret < 0) + goto err_free_put_context; + + counter_file = fget_light(ret, &fput_needed2); + if (!counter_file) + goto err_free_put_context; + + counter->filp = counter_file; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, counter, cpu); + ++ctx->generation; + mutex_unlock(&ctx->mutex); + + counter->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_counter_mutex); + list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); + mutex_unlock(¤t->perf_counter_mutex); + + fput_light(counter_file, fput_needed2); + +out_fput: + fput_light(group_file, fput_needed); + + return ret; + +err_free_put_context: + kfree(counter); + +err_put_context: + put_ctx(ctx); + + goto out_fput; +} + +/* + * inherit a counter from parent task to child task: + */ +static struct perf_counter * +inherit_counter(struct perf_counter *parent_counter, + struct task_struct *parent, + struct perf_counter_context *parent_ctx, + struct task_struct *child, + struct perf_counter *group_leader, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *child_counter; + + /* + * Instead of creating recursive hierarchies of counters, + * we link inherited counters back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_counter->parent) + parent_counter = parent_counter->parent; + + child_counter = perf_counter_alloc(&parent_counter->attr, + parent_counter->cpu, child_ctx, + group_leader, GFP_KERNEL); + if (IS_ERR(child_counter)) + return child_counter; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent counter, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_counter_{en, dis}able_family. + */ + if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) + child_counter->state = PERF_COUNTER_STATE_INACTIVE; + else + child_counter->state = PERF_COUNTER_STATE_OFF; + + if (parent_counter->attr.freq) + child_counter->hw.sample_period = parent_counter->hw.sample_period; + + /* + * Link it up in the child's context: + */ + add_counter_to_ctx(child_counter, child_ctx); + + child_counter->parent = parent_counter; + /* + * inherit into child's child as well: + */ + child_counter->attr.inherit = 1; + + /* + * Get a reference to the parent filp - we will fput it + * when the child counter exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_counter->filp->f_count); + + /* + * Link this into the parent counter's child list + */ + WARN_ON_ONCE(parent_counter->ctx->parent_ctx); + mutex_lock(&parent_counter->child_mutex); + list_add_tail(&child_counter->child_list, &parent_counter->child_list); + mutex_unlock(&parent_counter->child_mutex); + + return child_counter; +} + +static int inherit_group(struct perf_counter *parent_counter, + struct task_struct *parent, + struct perf_counter_context *parent_ctx, + struct task_struct *child, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *leader; + struct perf_counter *sub; + struct perf_counter *child_ctr; + + leader = inherit_counter(parent_counter, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { + child_ctr = inherit_counter(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; +} + +static void sync_child_counter(struct perf_counter *child_counter, + struct perf_counter *parent_counter) +{ + u64 child_val; + + child_val = atomic64_read(&child_counter->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_counter->count); + atomic64_add(child_counter->total_time_enabled, + &parent_counter->child_total_time_enabled); + atomic64_add(child_counter->total_time_running, + &parent_counter->child_total_time_running); + + /* + * Remove this counter from the parent's list + */ + WARN_ON_ONCE(parent_counter->ctx->parent_ctx); + mutex_lock(&parent_counter->child_mutex); + list_del_init(&child_counter->child_list); + mutex_unlock(&parent_counter->child_mutex); + + /* + * Release the parent counter, if this was the last + * reference to it. + */ + fput(parent_counter->filp); +} + +static void +__perf_counter_exit_task(struct perf_counter *child_counter, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *parent_counter; + + update_counter_times(child_counter); + perf_counter_remove_from_context(child_counter); + + parent_counter = child_counter->parent; + /* + * It can happen that parent exits first, and has counters + * that are still around due to the child reference. These + * counters need to be zapped - but otherwise linger. + */ + if (parent_counter) { + sync_child_counter(child_counter, parent_counter); + free_counter(child_counter); + } +} + +/* + * When a child task exits, feed back counter values to parent counters. + */ +void perf_counter_exit_task(struct task_struct *child) +{ + struct perf_counter *child_counter, *tmp; + struct perf_counter_context *child_ctx; + unsigned long flags; + + if (likely(!child->perf_counter_ctxp)) + return; + + local_irq_save(flags); + /* + * We can't reschedule here because interrupts are disabled, + * and either child is current or it is a task that can't be + * scheduled, so we are now safe from rescheduling changing + * our context. + */ + child_ctx = child->perf_counter_ctxp; + __perf_counter_task_sched_out(child_ctx); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_counter_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + spin_lock(&child_ctx->lock); + child->perf_counter_ctxp = NULL; + if (child_ctx->parent_ctx) { + /* + * This context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the counters from it. + */ + put_ctx(child_ctx->parent_ctx); + child_ctx->parent_ctx = NULL; + } + spin_unlock(&child_ctx->lock); + local_irq_restore(flags); + + /* + * We can recurse on the same lock type through: + * + * __perf_counter_exit_task() + * sync_child_counter() + * fput(parent_counter->filp) + * perf_release() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + +again: + list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, + list_entry) + __perf_counter_exit_task(child_counter, child_ctx); + + /* + * If the last counter was a group counter, it will have appended all + * its siblings to the list, but we obtained 'tmp' before that which + * will still point to the list head terminating the iteration. + */ + if (!list_empty(&child_ctx->counter_list)) + goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); +} + +/* + * free an unexposed, unused context as created by inheritance by + * init_task below, used by fork() in case of fail. + */ +void perf_counter_free_task(struct task_struct *task) +{ + struct perf_counter_context *ctx = task->perf_counter_ctxp; + struct perf_counter *counter, *tmp; + + if (!ctx) + return; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) { + struct perf_counter *parent = counter->parent; + + if (WARN_ON_ONCE(!parent)) + continue; + + mutex_lock(&parent->child_mutex); + list_del_init(&counter->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_counter(counter, ctx); + free_counter(counter); + } + + if (!list_empty(&ctx->counter_list)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + +/* + * Initialize the perf_counter context in task_struct + */ +int perf_counter_init_task(struct task_struct *child) +{ + struct perf_counter_context *child_ctx, *parent_ctx; + struct perf_counter_context *cloned_ctx; + struct perf_counter *counter; + struct task_struct *parent = current; + int inherited_all = 1; + int ret = 0; + + child->perf_counter_ctxp = NULL; + + mutex_init(&child->perf_counter_mutex); + INIT_LIST_HEAD(&child->perf_counter_list); + + if (likely(!parent->perf_counter_ctxp)) + return 0; + + /* + * This is executed from the parent task context, so inherit + * counters that have been marked for cloning. + * First allocate and initialize a context for the child. + */ + + child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!child_ctx) + return -ENOMEM; + + __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = child_ctx; + get_task_struct(child); + + /* + * If the parent's context is a clone, pin it so it won't get + * swapped under us. + */ + parent_ctx = perf_pin_task_context(parent); + + /* + * No need to check if parent_ctx != NULL here; since we saw + * it non-NULL earlier, the only reason for it to become NULL + * is if we exit, and since we're currently in the middle of + * a fork we can't be exiting at the same time. + */ + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. + */ + mutex_lock(&parent_ctx->mutex); + + /* + * We dont have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { + if (counter != counter->group_leader) + continue; + + if (!counter->attr.inherit) { + inherited_all = 0; + continue; + } + + ret = inherit_group(counter, parent, parent_ctx, + child, child_ctx); + if (ret) { + inherited_all = 0; + break; + } + } + + if (inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. + * Note that if the parent is a clone, it could get + * uncloned at any point, but that doesn't matter + * because the list of counters and the generation + * count can't have changed since we took the mutex. + */ + cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); + if (cloned_ctx) { + child_ctx->parent_ctx = cloned_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); + } + + mutex_unlock(&parent_ctx->mutex); + + perf_unpin_context(parent_ctx); + + return ret; +} + +static void __cpuinit perf_counter_init_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_counter_init_context(&cpuctx->ctx, NULL); + + spin_lock(&perf_resource_lock); + cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; + spin_unlock(&perf_resource_lock); + + hw_perf_counter_setup(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __perf_counter_exit_cpu(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = &cpuctx->ctx; + struct perf_counter *counter, *tmp; + + list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) + __perf_counter_remove_from_context(counter); +} +static void perf_counter_exit_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); + mutex_unlock(&ctx->mutex); +} +#else +static inline void perf_counter_exit_cpu(int cpu) { } +#endif + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + perf_counter_init_cpu(cpu); + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + perf_counter_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +/* + * This has to have a higher priority than migration_notifier in sched.c. + */ +static struct notifier_block __cpuinitdata perf_cpu_nb = { + .notifier_call = perf_cpu_notify, + .priority = 20, +}; + +void __init perf_counter_init(void) +{ + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&perf_cpu_nb); +} + +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_reserved_percpu); +} + +static ssize_t +perf_set_reserve_percpu(struct sysdev_class *class, + const char *buf, + size_t count) +{ + struct perf_cpu_context *cpuctx; + unsigned long val; + int err, cpu, mpt; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > perf_max_counters) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_reserved_percpu = val; + for_each_online_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + spin_lock_irq(&cpuctx->ctx.lock); + mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, + perf_max_counters - perf_reserved_percpu); + cpuctx->max_pertask = mpt; + spin_unlock_irq(&cpuctx->ctx.lock); + } + spin_unlock(&perf_resource_lock); + + return count; +} + +static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_overcommit); +} + +static ssize_t +perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +{ + unsigned long val; + int err; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > 1) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_overcommit = val; + spin_unlock(&perf_resource_lock); + + return count; +} + +static SYSDEV_CLASS_ATTR( + reserve_percpu, + 0644, + perf_show_reserve_percpu, + perf_set_reserve_percpu + ); + +static SYSDEV_CLASS_ATTR( + overcommit, + 0644, + perf_show_overcommit, + perf_set_overcommit + ); + +static struct attribute *perfclass_attrs[] = { + &attr_reserve_percpu.attr, + &attr_overcommit.attr, + NULL +}; + +static struct attribute_group perfclass_attr_group = { + .attrs = perfclass_attrs, + .name = "perf_counters", +}; + +static int __init perf_counter_sysfs_init(void) +{ + return sysfs_create_group(&cpu_sysdev_class.kset.kobj, + &perfclass_attr_group); +} +device_initcall(perf_counter_sysfs_init); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 23bd4da..72067cb 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -116,9 +116,13 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. +config HIBERNATION_NVS + bool + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE + select HIBERNATION_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 720ea4f..c3b81c3 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -6,6 +6,9 @@ endif obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o -obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o +obj-$(CONFIG_SUSPEND) += suspend.o +obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o +obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/disk.c b/kernel/power/hibernate.c index e71ca9c..81d2e74 100644 --- a/kernel/power/disk.c +++ b/kernel/power/hibernate.c @@ -1,12 +1,12 @@ /* - * kernel/power/disk.c - Suspend-to-disk support. + * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * * This file is released under the GPLv2. - * */ #include <linux/suspend.h> @@ -215,19 +215,17 @@ static int create_image(int platform_mode) if (error) return error; - device_pm_lock(); - - /* At this point, device_suspend() has been called, but *not* - * device_power_down(). We *must* call device_power_down() now. + /* At this point, dpm_suspend_start() has been called, but *not* + * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now. * Otherwise, drivers for some devices (e.g. interrupt controllers) * become desynchronized with the actual state of the hardware * at resume time, and evil weirdness ensues. */ - error = device_power_down(PMSG_FREEZE); + error = dpm_suspend_noirq(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); - goto Unlock; + return error; } error = platform_pre_snapshot(platform_mode); @@ -241,9 +239,9 @@ static int create_image(int platform_mode) local_irq_disable(); - sysdev_suspend(PMSG_FREEZE); + error = sysdev_suspend(PMSG_FREEZE); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " + printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); goto Enable_irqs; } @@ -264,7 +262,7 @@ static int create_image(int platform_mode) Power_up: sysdev_resume(); - /* NOTE: device_power_up() is just a resume() for devices + /* NOTE: dpm_resume_noirq() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ @@ -277,12 +275,9 @@ static int create_image(int platform_mode) Platform_finish: platform_finish(platform_mode); - device_power_up(in_suspend ? + dpm_resume_noirq(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - Unlock: - device_pm_unlock(); - return error; } @@ -309,7 +304,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - error = device_suspend(PMSG_FREEZE); + error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Recover_platform; @@ -320,7 +315,7 @@ int hibernation_snapshot(int platform_mode) /* Control returns here after successful restore */ Resume_devices: - device_resume(in_suspend ? + dpm_resume_end(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); resume_console(); Close: @@ -344,13 +339,11 @@ static int resume_target_kernel(bool platform_mode) { int error; - device_pm_lock(); - - error = device_power_down(PMSG_QUIESCE); + error = dpm_suspend_noirq(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); - goto Unlock; + return error; } error = platform_pre_restore(platform_mode); @@ -401,10 +394,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - device_power_up(PMSG_RECOVER); - - Unlock: - device_pm_unlock(); + dpm_resume_noirq(PMSG_RECOVER); return error; } @@ -424,10 +414,10 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - error = device_suspend(PMSG_QUIESCE); + error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); - device_resume(PMSG_RECOVER); + dpm_resume_end(PMSG_RECOVER); } resume_console(); pm_restore_console(); @@ -457,18 +447,16 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); - error = device_suspend(PMSG_HIBERNATE); + error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) hibernation_ops->recover(); goto Resume_devices; } - device_pm_lock(); - - error = device_power_down(PMSG_HIBERNATE); + error = dpm_suspend_noirq(PMSG_HIBERNATE); if (error) - goto Unlock; + goto Resume_devices; error = hibernation_ops->prepare(); if (error) @@ -491,14 +479,11 @@ int hibernation_platform_enter(void) Platofrm_finish: hibernation_ops->finish(); - device_power_up(PMSG_RESTORE); - - Unlock: - device_pm_unlock(); + dpm_suspend_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; - device_resume(PMSG_RESTORE); + dpm_resume_end(PMSG_RESTORE); resume_console(); Close: diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c new file mode 100644 index 0000000..39ac698 --- /dev/null +++ b/kernel/power/hibernate_nvs.c @@ -0,0 +1,135 @@ +/* + * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory + * + * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/suspend.h> + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * hibernation and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * hibernate_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int hibernate_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * hibernate_nvs_free - free data pages allocated for saving NVS regions + */ +void hibernate_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int hibernate_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + hibernate_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * hibernate_nvs_save - save NVS memory regions + */ +void hibernate_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + memcpy(entry->data, entry->kaddr, entry->size); + } +} + +/** + * hibernate_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. + */ +void hibernate_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/kernel/power/main.c b/kernel/power/main.c index f99ed6a..f710e36 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -8,20 +8,9 @@ * */ -#include <linux/module.h> -#include <linux/suspend.h> #include <linux/kobject.h> #include <linux/string.h> -#include <linux/delay.h> -#include <linux/errno.h> -#include <linux/kmod.h> -#include <linux/init.h> -#include <linux/console.h> -#include <linux/cpu.h> #include <linux/resume-trace.h> -#include <linux/freezer.h> -#include <linux/vmstat.h> -#include <linux/syscalls.h> #include "power.h" @@ -119,378 +108,6 @@ power_attr(pm_test); #endif /* CONFIG_PM_SLEEP */ -#ifdef CONFIG_SUSPEND - -static int suspend_test(int level) -{ -#ifdef CONFIG_PM_DEBUG - if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); - mdelay(5000); - return 1; - } -#endif /* !CONFIG_PM_DEBUG */ - return 0; -} - -#ifdef CONFIG_PM_TEST_SUSPEND - -/* - * We test the system suspend code by setting an RTC wakealarm a short - * time in the future, then suspending. Suspending the devices won't - * normally take long ... some systems only need a few milliseconds. - * - * The time it takes is system-specific though, so when we test this - * during system bootup we allow a LOT of time. - */ -#define TEST_SUSPEND_SECONDS 5 - -static unsigned long suspend_test_start_time; - -static void suspend_test_start(void) -{ - /* FIXME Use better timebase than "jiffies", ideally a clocksource. - * What we want is a hardware counter that will work correctly even - * during the irqs-are-off stages of the suspend/resume cycle... - */ - suspend_test_start_time = jiffies; -} - -static void suspend_test_finish(const char *label) -{ - long nj = jiffies - suspend_test_start_time; - unsigned msec; - - msec = jiffies_to_msecs(abs(nj)); - pr_info("PM: %s took %d.%03d seconds\n", label, - msec / 1000, msec % 1000); - - /* Warning on suspend means the RTC alarm period needs to be - * larger -- the system was sooo slooowwww to suspend that the - * alarm (should have) fired before the system went to sleep! - * - * Warning on either suspend or resume also means the system - * has some performance issues. The stack dump of a WARN_ON - * is more likely to get the right attention than a printk... - */ - WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); -} - -#else - -static void suspend_test_start(void) -{ -} - -static void suspend_test_finish(const char *label) -{ -} - -#endif - -/* This is just an arbitrary number */ -#define FREE_PAGE_NUMBER (100) - -static struct platform_suspend_ops *suspend_ops; - -/** - * suspend_set_ops - Set the global suspend method table. - * @ops: Pointer to ops structure. - */ - -void suspend_set_ops(struct platform_suspend_ops *ops) -{ - mutex_lock(&pm_mutex); - suspend_ops = ops; - mutex_unlock(&pm_mutex); -} - -/** - * suspend_valid_only_mem - generic memory-only valid callback - * - * Platform drivers that implement mem suspend only and only need - * to check for that in their .valid callback can use this instead - * of rolling their own .valid callback. - */ -int suspend_valid_only_mem(suspend_state_t state) -{ - return state == PM_SUSPEND_MEM; -} - -/** - * suspend_prepare - Do prep work before entering low-power state. - * - * This is common code that is called for each state that we're entering. - * Run suspend notifiers, allocate a console and stop all processes. - */ -static int suspend_prepare(void) -{ - int error; - unsigned int free_pages; - - if (!suspend_ops || !suspend_ops->enter) - return -EPERM; - - pm_prepare_console(); - - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (error) - goto Finish; - - error = usermodehelper_disable(); - if (error) - goto Finish; - - if (suspend_freeze_processes()) { - error = -EAGAIN; - goto Thaw; - } - - free_pages = global_page_state(NR_FREE_PAGES); - if (free_pages < FREE_PAGE_NUMBER) { - pr_debug("PM: free some memory\n"); - shrink_all_memory(FREE_PAGE_NUMBER - free_pages); - if (nr_free_pages() < FREE_PAGE_NUMBER) { - error = -ENOMEM; - printk(KERN_ERR "PM: No enough memory\n"); - } - } - if (!error) - return 0; - - Thaw: - suspend_thaw_processes(); - usermodehelper_enable(); - Finish: - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); - return error; -} - -/* default implementation */ -void __attribute__ ((weak)) arch_suspend_disable_irqs(void) -{ - local_irq_disable(); -} - -/* default implementation */ -void __attribute__ ((weak)) arch_suspend_enable_irqs(void) -{ - local_irq_enable(); -} - -/** - * suspend_enter - enter the desired system sleep state. - * @state: state to enter - * - * This function should be called after devices have been suspended. - */ -static int suspend_enter(suspend_state_t state) -{ - int error; - - device_pm_lock(); - - if (suspend_ops->prepare) { - error = suspend_ops->prepare(); - if (error) - goto Done; - } - - error = device_power_down(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Platfrom_finish; - } - - if (suspend_ops->prepare_late) { - error = suspend_ops->prepare_late(); - if (error) - goto Power_up_devices; - } - - if (suspend_test(TEST_PLATFORM)) - goto Platform_wake; - - error = disable_nonboot_cpus(); - if (error || suspend_test(TEST_CPUS)) - goto Enable_cpus; - - arch_suspend_disable_irqs(); - BUG_ON(!irqs_disabled()); - - error = sysdev_suspend(PMSG_SUSPEND); - if (!error) { - if (!suspend_test(TEST_CORE)) - error = suspend_ops->enter(state); - sysdev_resume(); - } - - arch_suspend_enable_irqs(); - BUG_ON(irqs_disabled()); - - Enable_cpus: - enable_nonboot_cpus(); - - Platform_wake: - if (suspend_ops->wake) - suspend_ops->wake(); - - Power_up_devices: - device_power_up(PMSG_RESUME); - - Platfrom_finish: - if (suspend_ops->finish) - suspend_ops->finish(); - - Done: - device_pm_unlock(); - - return error; -} - -/** - * suspend_devices_and_enter - suspend devices and enter the desired system - * sleep state. - * @state: state to enter - */ -int suspend_devices_and_enter(suspend_state_t state) -{ - int error; - - if (!suspend_ops) - return -ENOSYS; - - if (suspend_ops->begin) { - error = suspend_ops->begin(state); - if (error) - goto Close; - } - suspend_console(); - suspend_test_start(); - error = device_suspend(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Recover_platform; - } - suspend_test_finish("suspend devices"); - if (suspend_test(TEST_DEVICES)) - goto Recover_platform; - - suspend_enter(state); - - Resume_devices: - suspend_test_start(); - device_resume(PMSG_RESUME); - suspend_test_finish("resume devices"); - resume_console(); - Close: - if (suspend_ops->end) - suspend_ops->end(); - return error; - - Recover_platform: - if (suspend_ops->recover) - suspend_ops->recover(); - goto Resume_devices; -} - -/** - * suspend_finish - Do final work before exiting suspend sequence. - * - * Call platform code to clean up, restart processes, and free the - * console that we've allocated. This is not called for suspend-to-disk. - */ -static void suspend_finish(void) -{ - suspend_thaw_processes(); - usermodehelper_enable(); - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); -} - - - - -static const char * const pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_STANDBY] = "standby", - [PM_SUSPEND_MEM] = "mem", -}; - -static inline int valid_state(suspend_state_t state) -{ - /* All states need lowlevel support and need to be valid - * to the lowlevel implementation, no valid callback - * implies that none are valid. */ - if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state)) - return 0; - return 1; -} - - -/** - * enter_state - Do common work of entering low-power state. - * @state: pm_state structure for state we're entering. - * - * Make sure we're the only ones trying to enter a sleep state. Fail - * if someone has beat us to it, since we don't want anything weird to - * happen when we wake up. - * Then, do the setup for suspend, enter the state, and cleaup (after - * we've woken up). - */ -static int enter_state(suspend_state_t state) -{ - int error; - - if (!valid_state(state)) - return -ENODEV; - - if (!mutex_trylock(&pm_mutex)) - return -EBUSY; - - printk(KERN_INFO "PM: Syncing filesystems ... "); - sys_sync(); - printk("done.\n"); - - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); - error = suspend_prepare(); - if (error) - goto Unlock; - - if (suspend_test(TEST_FREEZER)) - goto Finish; - - pr_debug("PM: Entering %s sleep\n", pm_states[state]); - error = suspend_devices_and_enter(state); - - Finish: - pr_debug("PM: Finishing wakeup.\n"); - suspend_finish(); - Unlock: - mutex_unlock(&pm_mutex); - return error; -} - - -/** - * pm_suspend - Externally visible function for suspending system. - * @state: Enumerated value of state to enter. - * - * Determine whether or not value is within range, get state - * structure, and enter (above). - */ - -int pm_suspend(suspend_state_t state) -{ - if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) - return enter_state(state); - return -EINVAL; -} - -EXPORT_SYMBOL(pm_suspend); - -#endif /* CONFIG_SUSPEND */ - struct kobject *power_kobj; /** @@ -503,7 +120,6 @@ struct kobject *power_kobj; * store() accepts one of those strings, translates it into the * proper enumerated value, and initiates a suspend transition. */ - static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -601,7 +217,6 @@ static struct attribute_group attr_group = { .attrs = g, }; - static int __init pm_init(void) { power_kobj = kobject_create_and_add("power", NULL); @@ -611,144 +226,3 @@ static int __init pm_init(void) } core_initcall(pm_init); - - -#ifdef CONFIG_PM_TEST_SUSPEND - -#include <linux/rtc.h> - -/* - * To test system suspend, we need a hands-off mechanism to resume the - * system. RTCs wake alarms are a common self-contained mechanism. - */ - -static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) -{ - static char err_readtime[] __initdata = - KERN_ERR "PM: can't read %s time, err %d\n"; - static char err_wakealarm [] __initdata = - KERN_ERR "PM: can't set %s wakealarm, err %d\n"; - static char err_suspend[] __initdata = - KERN_ERR "PM: suspend test failed, error %d\n"; - static char info_test[] __initdata = - KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; - - unsigned long now; - struct rtc_wkalrm alm; - int status; - - /* this may fail if the RTC hasn't been initialized */ - status = rtc_read_time(rtc, &alm.time); - if (status < 0) { - printk(err_readtime, dev_name(&rtc->dev), status); - return; - } - rtc_tm_to_time(&alm.time, &now); - - memset(&alm, 0, sizeof alm); - rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); - alm.enabled = true; - - status = rtc_set_alarm(rtc, &alm); - if (status < 0) { - printk(err_wakealarm, dev_name(&rtc->dev), status); - return; - } - - if (state == PM_SUSPEND_MEM) { - printk(info_test, pm_states[state]); - status = pm_suspend(state); - if (status == -ENODEV) - state = PM_SUSPEND_STANDBY; - } - if (state == PM_SUSPEND_STANDBY) { - printk(info_test, pm_states[state]); - status = pm_suspend(state); - } - if (status < 0) - printk(err_suspend, status); - - /* Some platforms can't detect that the alarm triggered the - * wakeup, or (accordingly) disable it after it afterwards. - * It's supposed to give oneshot behavior; cope. - */ - alm.enabled = false; - rtc_set_alarm(rtc, &alm); -} - -static int __init has_wakealarm(struct device *dev, void *name_ptr) -{ - struct rtc_device *candidate = to_rtc_device(dev); - - if (!candidate->ops->set_alarm) - return 0; - if (!device_may_wakeup(candidate->dev.parent)) - return 0; - - *(const char **)name_ptr = dev_name(dev); - return 1; -} - -/* - * Kernel options like "test_suspend=mem" force suspend/resume sanity tests - * at startup time. They're normally disabled, for faster boot and because - * we can't know which states really work on this particular system. - */ -static suspend_state_t test_state __initdata = PM_SUSPEND_ON; - -static char warn_bad_state[] __initdata = - KERN_WARNING "PM: can't test '%s' suspend state\n"; - -static int __init setup_test_suspend(char *value) -{ - unsigned i; - - /* "=mem" ==> "mem" */ - value++; - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (!pm_states[i]) - continue; - if (strcmp(pm_states[i], value) != 0) - continue; - test_state = (__force suspend_state_t) i; - return 0; - } - printk(warn_bad_state, value); - return 0; -} -__setup("test_suspend", setup_test_suspend); - -static int __init test_suspend(void) -{ - static char warn_no_rtc[] __initdata = - KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; - - char *pony = NULL; - struct rtc_device *rtc = NULL; - - /* PM is initialized by now; is that state testable? */ - if (test_state == PM_SUSPEND_ON) - goto done; - if (!valid_state(test_state)) { - printk(warn_bad_state, pm_states[test_state]); - goto done; - } - - /* RTCs have initialized by now too ... can we use one? */ - class_find_device(rtc_class, NULL, &pony, has_wakealarm); - if (pony) - rtc = rtc_class_open(pony); - if (!rtc) { - printk(warn_no_rtc); - goto done; - } - - /* go for it */ - test_wakealarm(rtc, test_state); - rtc_class_close(rtc); -done: - return 0; -} -late_initcall(test_suspend); - -#endif /* CONFIG_PM_TEST_SUSPEND */ diff --git a/kernel/power/power.h b/kernel/power/power.h index 46b5ec7..26d5a26 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -45,7 +45,7 @@ static inline char *check_image_kernel(struct swsusp_info *info) */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) -/* kernel/power/disk.c */ +/* kernel/power/hibernate.c */ extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); @@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void); extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); -extern unsigned int count_data_pages(void); +extern int swsusp_shrink_memory(void); /** * Auxiliary structure used for reading the snapshot image data and @@ -147,9 +147,8 @@ extern int swsusp_swap_in_use(void); */ #define SF_PLATFORM_MODE 1 -/* kernel/power/disk.c */ +/* kernel/power/hibernate.c */ extern int swsusp_check(void); -extern int swsusp_shrink_memory(void); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); @@ -161,22 +160,36 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); #ifdef CONFIG_SUSPEND -/* kernel/power/main.c */ +/* kernel/power/suspend.c */ +extern const char *const pm_states[]; + +extern bool valid_state(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); +extern int enter_state(suspend_state_t state); #else /* !CONFIG_SUSPEND */ static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; } +static inline int enter_state(suspend_state_t state) { return -ENOSYS; } +static inline bool valid_state(suspend_state_t state) { return false; } #endif /* !CONFIG_SUSPEND */ +#ifdef CONFIG_PM_TEST_SUSPEND +/* kernel/power/suspend_test.c */ +extern void suspend_test_start(void); +extern void suspend_test_finish(const char *label); +#else /* !CONFIG_PM_TEST_SUSPEND */ +static inline void suspend_test_start(void) {} +static inline void suspend_test_finish(const char *label) {} +#endif /* !CONFIG_PM_TEST_SUSPEND */ + #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ extern int pm_notifier_call_chain(unsigned long val); #endif #ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void); int restore_highmem(void); #else static inline unsigned int count_highmem_pages(void) { return 0; } diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 9789083..e8b3370 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -34,7 +34,7 @@ static struct sysrq_key_op sysrq_poweroff_op = { .handler = handle_poweroff, .help_msg = "powerOff", .action_msg = "Power Off", - .enable_mask = SYSRQ_ENABLE_BOOT, + .enable_mask = SYSRQ_ENABLE_BOOT, }; static int pm_sysrq_init(void) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 33e2e4a..523a451 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -39,6 +39,14 @@ static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); +/* + * Preferred image size in bytes (tunable via /sys/power/image_size). + * When it is set to N, swsusp will do its best to ensure the image + * size will not exceed N bytes, but if that is impossible, it will + * try to create the smallest image possible. + */ +unsigned long image_size = 500 * 1024 * 1024; + /* List of PBEs needed for restoring the pages that were allocated before * the suspend and included in the suspend image, but have also been * allocated by the "resume" kernel, so their contents cannot be written @@ -840,7 +848,7 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) * pages. */ -unsigned int count_highmem_pages(void) +static unsigned int count_highmem_pages(void) { struct zone *zone; unsigned int n = 0; @@ -902,7 +910,7 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) * pages. */ -unsigned int count_data_pages(void) +static unsigned int count_data_pages(void) { struct zone *zone; unsigned long pfn, max_zone_pfn; @@ -1058,6 +1066,74 @@ void swsusp_free(void) buffer = NULL; } +/** + * swsusp_shrink_memory - Try to free as much memory as needed + * + * ... but do not OOM-kill anyone + * + * Notice: all userland should be stopped before it is called, or + * livelock is possible. + */ + +#define SHRINK_BITE 10000 +static inline unsigned long __shrink_memory(long tmp) +{ + if (tmp > SHRINK_BITE) + tmp = SHRINK_BITE; + return shrink_all_memory(tmp); +} + +int swsusp_shrink_memory(void) +{ + long tmp; + struct zone *zone; + unsigned long pages = 0; + unsigned int i = 0; + char *p = "-\\|/"; + struct timeval start, stop; + + printk(KERN_INFO "PM: Shrinking memory... "); + do_gettimeofday(&start); + do { + long size, highmem_size; + + highmem_size = count_highmem_pages(); + size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; + tmp = size; + size += highmem_size; + for_each_populated_zone(zone) { + tmp += snapshot_additional_pages(zone); + if (is_highmem(zone)) { + highmem_size -= + zone_page_state(zone, NR_FREE_PAGES); + } else { + tmp -= zone_page_state(zone, NR_FREE_PAGES); + tmp += zone->lowmem_reserve[ZONE_NORMAL]; + } + } + + if (highmem_size < 0) + highmem_size = 0; + + tmp += highmem_size; + if (tmp > 0) { + tmp = __shrink_memory(tmp); + if (!tmp) + return -ENOMEM; + pages += tmp; + } else if (size > image_size / PAGE_SIZE) { + tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); + pages += tmp; + } + printk("\b%c", p[i++%4]); + } while (tmp > 0); + do_gettimeofday(&stop); + printk("\bdone (%lu pages freed)\n", pages); + swsusp_show_speed(&start, &stop, pages, "Freed"); + + return 0; +} + #ifdef CONFIG_HIGHMEM /** * count_pages_for_highmem - compute the number of non-highmem pages diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c new file mode 100644 index 0000000..6f10dfc --- /dev/null +++ b/kernel/power/suspend.c @@ -0,0 +1,300 @@ +/* + * kernel/power/suspend.c - Suspend to RAM and standby functionality. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/console.h> +#include <linux/cpu.h> +#include <linux/syscalls.h> + +#include "power.h" + +const char *const pm_states[PM_SUSPEND_MAX] = { + [PM_SUSPEND_STANDBY] = "standby", + [PM_SUSPEND_MEM] = "mem", +}; + +static struct platform_suspend_ops *suspend_ops; + +/** + * suspend_set_ops - Set the global suspend method table. + * @ops: Pointer to ops structure. + */ +void suspend_set_ops(struct platform_suspend_ops *ops) +{ + mutex_lock(&pm_mutex); + suspend_ops = ops; + mutex_unlock(&pm_mutex); +} + +bool valid_state(suspend_state_t state) +{ + /* + * All states need lowlevel support and need to be valid to the lowlevel + * implementation, no valid callback implies that none are valid. + */ + return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); +} + +/** + * suspend_valid_only_mem - generic memory-only valid callback + * + * Platform drivers that implement mem suspend only and only need + * to check for that in their .valid callback can use this instead + * of rolling their own .valid callback. + */ +int suspend_valid_only_mem(suspend_state_t state) +{ + return state == PM_SUSPEND_MEM; +} + +static int suspend_test(int level) +{ +#ifdef CONFIG_PM_DEBUG + if (pm_test_level == level) { + printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); + mdelay(5000); + return 1; + } +#endif /* !CONFIG_PM_DEBUG */ + return 0; +} + +/** + * suspend_prepare - Do prep work before entering low-power state. + * + * This is common code that is called for each state that we're entering. + * Run suspend notifiers, allocate a console and stop all processes. + */ +static int suspend_prepare(void) +{ + int error; + + if (!suspend_ops || !suspend_ops->enter) + return -EPERM; + + pm_prepare_console(); + + error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); + if (error) + goto Finish; + + error = usermodehelper_disable(); + if (error) + goto Finish; + + error = suspend_freeze_processes(); + if (!error) + return 0; + + suspend_thaw_processes(); + usermodehelper_enable(); + Finish: + pm_notifier_call_chain(PM_POST_SUSPEND); + pm_restore_console(); + return error; +} + +/* default implementation */ +void __attribute__ ((weak)) arch_suspend_disable_irqs(void) +{ + local_irq_disable(); +} + +/* default implementation */ +void __attribute__ ((weak)) arch_suspend_enable_irqs(void) +{ + local_irq_enable(); +} + +/** + * suspend_enter - enter the desired system sleep state. + * @state: state to enter + * + * This function should be called after devices have been suspended. + */ +static int suspend_enter(suspend_state_t state) +{ + int error; + + if (suspend_ops->prepare) { + error = suspend_ops->prepare(); + if (error) + return error; + } + + error = dpm_suspend_noirq(PMSG_SUSPEND); + if (error) { + printk(KERN_ERR "PM: Some devices failed to power down\n"); + goto Platfrom_finish; + } + + if (suspend_ops->prepare_late) { + error = suspend_ops->prepare_late(); + if (error) + goto Power_up_devices; + } + + if (suspend_test(TEST_PLATFORM)) + goto Platform_wake; + + error = disable_nonboot_cpus(); + if (error || suspend_test(TEST_CPUS)) + goto Enable_cpus; + + arch_suspend_disable_irqs(); + BUG_ON(!irqs_disabled()); + + error = sysdev_suspend(PMSG_SUSPEND); + if (!error) { + if (!suspend_test(TEST_CORE)) + error = suspend_ops->enter(state); + sysdev_resume(); + } + + arch_suspend_enable_irqs(); + BUG_ON(irqs_disabled()); + + Enable_cpus: + enable_nonboot_cpus(); + + Platform_wake: + if (suspend_ops->wake) + suspend_ops->wake(); + + Power_up_devices: + dpm_resume_noirq(PMSG_RESUME); + + Platfrom_finish: + if (suspend_ops->finish) + suspend_ops->finish(); + + return error; +} + +/** + * suspend_devices_and_enter - suspend devices and enter the desired system + * sleep state. + * @state: state to enter + */ +int suspend_devices_and_enter(suspend_state_t state) +{ + int error; + + if (!suspend_ops) + return -ENOSYS; + + if (suspend_ops->begin) { + error = suspend_ops->begin(state); + if (error) + goto Close; + } + suspend_console(); + suspend_test_start(); + error = dpm_suspend_start(PMSG_SUSPEND); + if (error) { + printk(KERN_ERR "PM: Some devices failed to suspend\n"); + goto Recover_platform; + } + suspend_test_finish("suspend devices"); + if (suspend_test(TEST_DEVICES)) + goto Recover_platform; + + suspend_enter(state); + + Resume_devices: + suspend_test_start(); + dpm_resume_end(PMSG_RESUME); + suspend_test_finish("resume devices"); + resume_console(); + Close: + if (suspend_ops->end) + suspend_ops->end(); + return error; + + Recover_platform: + if (suspend_ops->recover) + suspend_ops->recover(); + goto Resume_devices; +} + +/** + * suspend_finish - Do final work before exiting suspend sequence. + * + * Call platform code to clean up, restart processes, and free the + * console that we've allocated. This is not called for suspend-to-disk. + */ +static void suspend_finish(void) +{ + suspend_thaw_processes(); + usermodehelper_enable(); + pm_notifier_call_chain(PM_POST_SUSPEND); + pm_restore_console(); +} + +/** + * enter_state - Do common work of entering low-power state. + * @state: pm_state structure for state we're entering. + * + * Make sure we're the only ones trying to enter a sleep state. Fail + * if someone has beat us to it, since we don't want anything weird to + * happen when we wake up. + * Then, do the setup for suspend, enter the state, and cleaup (after + * we've woken up). + */ +int enter_state(suspend_state_t state) +{ + int error; + + if (!valid_state(state)) + return -ENODEV; + + if (!mutex_trylock(&pm_mutex)) + return -EBUSY; + + printk(KERN_INFO "PM: Syncing filesystems ... "); + sys_sync(); + printk("done.\n"); + + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); + error = suspend_prepare(); + if (error) + goto Unlock; + + if (suspend_test(TEST_FREEZER)) + goto Finish; + + pr_debug("PM: Entering %s sleep\n", pm_states[state]); + error = suspend_devices_and_enter(state); + + Finish: + pr_debug("PM: Finishing wakeup.\n"); + suspend_finish(); + Unlock: + mutex_unlock(&pm_mutex); + return error; +} + +/** + * pm_suspend - Externally visible function for suspending system. + * @state: Enumerated value of state to enter. + * + * Determine whether or not value is within range, get state + * structure, and enter (above). + */ +int pm_suspend(suspend_state_t state) +{ + if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) + return enter_state(state); + return -EINVAL; +} +EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c new file mode 100644 index 0000000..17d8bb1a --- /dev/null +++ b/kernel/power/suspend_test.c @@ -0,0 +1,187 @@ +/* + * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. + * + * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz> + * + * This file is released under the GPLv2. + */ + +#include <linux/init.h> +#include <linux/rtc.h> + +#include "power.h" + +/* + * We test the system suspend code by setting an RTC wakealarm a short + * time in the future, then suspending. Suspending the devices won't + * normally take long ... some systems only need a few milliseconds. + * + * The time it takes is system-specific though, so when we test this + * during system bootup we allow a LOT of time. + */ +#define TEST_SUSPEND_SECONDS 5 + +static unsigned long suspend_test_start_time; + +void suspend_test_start(void) +{ + /* FIXME Use better timebase than "jiffies", ideally a clocksource. + * What we want is a hardware counter that will work correctly even + * during the irqs-are-off stages of the suspend/resume cycle... + */ + suspend_test_start_time = jiffies; +} + +void suspend_test_finish(const char *label) +{ + long nj = jiffies - suspend_test_start_time; + unsigned msec; + + msec = jiffies_to_msecs(abs(nj)); + pr_info("PM: %s took %d.%03d seconds\n", label, + msec / 1000, msec % 1000); + + /* Warning on suspend means the RTC alarm period needs to be + * larger -- the system was sooo slooowwww to suspend that the + * alarm (should have) fired before the system went to sleep! + * + * Warning on either suspend or resume also means the system + * has some performance issues. The stack dump of a WARN_ON + * is more likely to get the right attention than a printk... + */ + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); +} + +/* + * To test system suspend, we need a hands-off mechanism to resume the + * system. RTCs wake alarms are a common self-contained mechanism. + */ + +static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) +{ + static char err_readtime[] __initdata = + KERN_ERR "PM: can't read %s time, err %d\n"; + static char err_wakealarm [] __initdata = + KERN_ERR "PM: can't set %s wakealarm, err %d\n"; + static char err_suspend[] __initdata = + KERN_ERR "PM: suspend test failed, error %d\n"; + static char info_test[] __initdata = + KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; + + unsigned long now; + struct rtc_wkalrm alm; + int status; + + /* this may fail if the RTC hasn't been initialized */ + status = rtc_read_time(rtc, &alm.time); + if (status < 0) { + printk(err_readtime, dev_name(&rtc->dev), status); + return; + } + rtc_tm_to_time(&alm.time, &now); + + memset(&alm, 0, sizeof alm); + rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); + alm.enabled = true; + + status = rtc_set_alarm(rtc, &alm); + if (status < 0) { + printk(err_wakealarm, dev_name(&rtc->dev), status); + return; + } + + if (state == PM_SUSPEND_MEM) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + if (status == -ENODEV) + state = PM_SUSPEND_STANDBY; + } + if (state == PM_SUSPEND_STANDBY) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + } + if (status < 0) + printk(err_suspend, status); + + /* Some platforms can't detect that the alarm triggered the + * wakeup, or (accordingly) disable it after it afterwards. + * It's supposed to give oneshot behavior; cope. + */ + alm.enabled = false; + rtc_set_alarm(rtc, &alm); +} + +static int __init has_wakealarm(struct device *dev, void *name_ptr) +{ + struct rtc_device *candidate = to_rtc_device(dev); + + if (!candidate->ops->set_alarm) + return 0; + if (!device_may_wakeup(candidate->dev.parent)) + return 0; + + *(const char **)name_ptr = dev_name(dev); + return 1; +} + +/* + * Kernel options like "test_suspend=mem" force suspend/resume sanity tests + * at startup time. They're normally disabled, for faster boot and because + * we can't know which states really work on this particular system. + */ +static suspend_state_t test_state __initdata = PM_SUSPEND_ON; + +static char warn_bad_state[] __initdata = + KERN_WARNING "PM: can't test '%s' suspend state\n"; + +static int __init setup_test_suspend(char *value) +{ + unsigned i; + + /* "=mem" ==> "mem" */ + value++; + for (i = 0; i < PM_SUSPEND_MAX; i++) { + if (!pm_states[i]) + continue; + if (strcmp(pm_states[i], value) != 0) + continue; + test_state = (__force suspend_state_t) i; + return 0; + } + printk(warn_bad_state, value); + return 0; +} +__setup("test_suspend", setup_test_suspend); + +static int __init test_suspend(void) +{ + static char warn_no_rtc[] __initdata = + KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; + + char *pony = NULL; + struct rtc_device *rtc = NULL; + + /* PM is initialized by now; is that state testable? */ + if (test_state == PM_SUSPEND_ON) + goto done; + if (!valid_state(test_state)) { + printk(warn_bad_state, pm_states[test_state]); + goto done; + } + + /* RTCs have initialized by now too ... can we use one? */ + class_find_device(rtc_class, NULL, &pony, has_wakealarm); + if (pony) + rtc = rtc_class_open(pony); + if (!rtc) { + printk(warn_no_rtc); + goto done; + } + + /* go for it */ + test_wakealarm(rtc, test_state); + rtc_class_close(rtc); +done: + return 0; +} +late_initcall(test_suspend); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 78c3504..6a07f4d 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -55,14 +55,6 @@ #include "power.h" -/* - * Preferred image size in bytes (tunable via /sys/power/image_size). - * When it is set to N, swsusp will do its best to ensure the image - * size will not exceed N bytes, but if that is impossible, it will - * try to create the smallest image possible. - */ -unsigned long image_size = 500 * 1024 * 1024; - int in_suspend __nosavedata = 0; /** @@ -194,193 +186,3 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, centisecs / 100, centisecs % 100, kps / 1000, (kps % 1000) / 10); } - -/** - * swsusp_shrink_memory - Try to free as much memory as needed - * - * ... but do not OOM-kill anyone - * - * Notice: all userland should be stopped before it is called, or - * livelock is possible. - */ - -#define SHRINK_BITE 10000 -static inline unsigned long __shrink_memory(long tmp) -{ - if (tmp > SHRINK_BITE) - tmp = SHRINK_BITE; - return shrink_all_memory(tmp); -} - -int swsusp_shrink_memory(void) -{ - long tmp; - struct zone *zone; - unsigned long pages = 0; - unsigned int i = 0; - char *p = "-\\|/"; - struct timeval start, stop; - - printk(KERN_INFO "PM: Shrinking memory... "); - do_gettimeofday(&start); - do { - long size, highmem_size; - - highmem_size = count_highmem_pages(); - size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; - tmp = size; - size += highmem_size; - for_each_populated_zone(zone) { - tmp += snapshot_additional_pages(zone); - if (is_highmem(zone)) { - highmem_size -= - zone_page_state(zone, NR_FREE_PAGES); - } else { - tmp -= zone_page_state(zone, NR_FREE_PAGES); - tmp += zone->lowmem_reserve[ZONE_NORMAL]; - } - } - - if (highmem_size < 0) - highmem_size = 0; - - tmp += highmem_size; - if (tmp > 0) { - tmp = __shrink_memory(tmp); - if (!tmp) - return -ENOMEM; - pages += tmp; - } else if (size > image_size / PAGE_SIZE) { - tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); - pages += tmp; - } - printk("\b%c", p[i++%4]); - } while (tmp > 0); - do_gettimeofday(&stop); - printk("\bdone (%lu pages freed)\n", pages); - swsusp_show_speed(&start, &stop, pages, "Freed"); - - return 0; -} - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * hibernation and to restore the contents of this memory during the subsequent - * resume. The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { - unsigned long phys_start; - unsigned int size; - void *kaddr; - void *data; - struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - * hibernate_nvs_register - register platform NVS memory region to save - * @start - physical address of the region - * @size - size of the region - * - * The NVS region need not be page-aligned (both ends) and we arrange - * things so that the data from page-aligned addresses in this region will - * be copied into separate RAM pages. - */ -int hibernate_nvs_register(unsigned long start, unsigned long size) -{ - struct nvs_page *entry, *next; - - while (size > 0) { - unsigned int nr_bytes; - - entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); - if (!entry) - goto Error; - - list_add_tail(&entry->node, &nvs_list); - entry->phys_start = start; - nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); - entry->size = (size < nr_bytes) ? size : nr_bytes; - - start += entry->size; - size -= entry->size; - } - return 0; - - Error: - list_for_each_entry_safe(entry, next, &nvs_list, node) { - list_del(&entry->node); - kfree(entry); - } - return -ENOMEM; -} - -/** - * hibernate_nvs_free - free data pages allocated for saving NVS regions - */ -void hibernate_nvs_free(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - free_page((unsigned long)entry->data); - entry->data = NULL; - if (entry->kaddr) { - iounmap(entry->kaddr); - entry->kaddr = NULL; - } - } -} - -/** - * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int hibernate_nvs_alloc(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) { - entry->data = (void *)__get_free_page(GFP_KERNEL); - if (!entry->data) { - hibernate_nvs_free(); - return -ENOMEM; - } - } - return 0; -} - -/** - * hibernate_nvs_save - save NVS memory regions - */ -void hibernate_nvs_save(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Saving platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - entry->kaddr = ioremap(entry->phys_start, entry->size); - memcpy(entry->data, entry->kaddr, entry->size); - } -} - -/** - * hibernate_nvs_restore - restore NVS memory regions - * - * This function is going to be called with interrupts disabled, so it - * cannot iounmap the virtual addresses used to access the NVS region. - */ -void hibernate_nvs_restore(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) - memcpy(entry->kaddr, entry->data, entry->size); -} diff --git a/kernel/profile.c b/kernel/profile.c index 7724e04..28cf26a 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -111,12 +111,6 @@ int __ref profile_init(void) /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; buffer_bytes = prof_len*sizeof(atomic_t); - if (!slab_is_available()) { - prof_buffer = alloc_bootmem(buffer_bytes); - alloc_bootmem_cpumask_var(&prof_cpu_mask); - cpumask_copy(prof_cpu_mask, cpu_possible_mask); - return 0; - } if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) return -ENOMEM; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0692ab5..f6d8b8c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -25,16 +25,6 @@ /* - * Initialize a new task whose father had been ptraced. - * - * Called from copy_process(). - */ -void ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ - arch_ptrace_fork(child, clone_flags); -} - -/* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. * @@ -185,10 +175,11 @@ int ptrace_attach(struct task_struct *task) if (same_thread_group(task, current)) goto out; - /* Protect exec's credential calculations against our interference; - * SUID, SGID and LSM creds get determined differently under ptrace. + /* Protect the target's credential calculations against our + * interference; SUID, SGID and LSM creds get determined differently + * under ptrace. */ - retval = mutex_lock_interruptible(&task->cred_exec_mutex); + retval = mutex_lock_interruptible(&task->cred_guard_mutex); if (retval < 0) goto out; @@ -232,7 +223,7 @@ repeat: bad: write_unlock_irqrestore(&tasklist_lock, flags); task_unlock(task); - mutex_unlock(&task->cred_exec_mutex); + mutex_unlock(&task->cred_guard_mutex); out: return retval; } @@ -304,6 +295,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data) if (child->ptrace) { child->exit_code = data; dead = __ptrace_detach(current, child); + if (!child->exit_state) + wake_up_process(child); } write_unlock_irq(&tasklist_lock); diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index ce97a4d..beb0e65 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg) rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - ret = 0; + ret = 0; /* unused */ __wait_event_interruptible(rcu_ctrlblk.sched_wq, rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, ret); - /* - * Signals would prevent us from sleeping, and we cannot - * do much with them in any case. So flush them. - */ - if (ret) - flush_signals(current); couldsleepnext = 0; } while (!kthread_should_stop()); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d2a372f..0dccfbb 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) check_cpu_stall(rsp, rdp); /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->qs_pending) + if (rdp->qs_pending) { + rdp->n_rp_qs_pending++; return 1; + } /* Does this CPU have callbacks ready to invoke? */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (cpu_has_callbacks_ready_to_invoke(rdp)) { + rdp->n_rp_cb_ready++; return 1; + } /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) + if (cpu_needs_another_gp(rsp, rdp)) { + rdp->n_rp_cpu_needs_gp++; return 1; + } /* Has another RCU grace period completed? */ - if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ + if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ + rdp->n_rp_gp_completed++; return 1; + } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ + if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ + rdp->n_rp_gp_started++; return 1; + } /* Has an RCU GP gone long enough to send resched IPIs &c? */ if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && - ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) + ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { + rdp->n_rp_need_fqs++; return 1; + } /* nothing to do */ + rdp->n_rp_need_nothing++; return 0; } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4b1875b..fe1dcdb 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = { .release = single_release, }; -static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; +static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) +{ + seq_printf(m, "%3d%cnp=%ld " + "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? '!' : ' ', + rdp->n_rcu_pending, + rdp->n_rp_qs_pending, + rdp->n_rp_cb_ready, + rdp->n_rp_cpu_needs_gp, + rdp->n_rp_gp_completed, + rdp->n_rp_gp_started, + rdp->n_rp_need_fqs, + rdp->n_rp_need_nothing); +} + +static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) +{ + int cpu; + struct rcu_data *rdp; + + for_each_possible_cpu(cpu) { + rdp = rsp->rda[cpu]; + if (rdp->beenonline) + print_one_rcu_pending(m, rdp); + } +} + +static int show_rcu_pending(struct seq_file *m, void *unused) +{ + seq_puts(m, "rcu:\n"); + print_rcu_pendings(m, &rcu_state); + seq_puts(m, "rcu_bh:\n"); + print_rcu_pendings(m, &rcu_bh_state); + return 0; +} + +static int rcu_pending_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcu_pending, NULL); +} + +static struct file_operations rcu_pending_fops = { + .owner = THIS_MODULE, + .open = rcu_pending_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir; +static struct dentry *datadir; +static struct dentry *datadir_csv; +static struct dentry *gpdir; +static struct dentry *hierdir; +static struct dentry *rcu_pendingdir; + static int __init rcuclassic_trace_init(void) { rcudir = debugfs_create_dir("rcu", NULL); @@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void) NULL, &rcuhier_fops); if (!hierdir) goto free_out; + + rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, + NULL, &rcu_pending_fops); + if (!rcu_pendingdir) + goto free_out; return 0; free_out: if (datadir) @@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void) debugfs_remove(datadir_csv); debugfs_remove(gpdir); debugfs_remove(hierdir); + debugfs_remove(rcu_pendingdir); debugfs_remove(rcudir); } diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 69d9cb9..fcd107a 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * assigned pending owner [which might not have taken the * lock yet]: */ -static inline int try_to_steal_lock(struct rt_mutex *lock) +static inline int try_to_steal_lock(struct rt_mutex *lock, + struct task_struct *task) { struct task_struct *pendowner = rt_mutex_owner(lock); struct rt_mutex_waiter *next; @@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) if (!rt_mutex_owner_pending(lock)) return 0; - if (pendowner == current) + if (pendowner == task) return 1; spin_lock_irqsave(&pendowner->pi_lock, flags); - if (current->prio >= pendowner->prio) { + if (task->prio >= pendowner->prio) { spin_unlock_irqrestore(&pendowner->pi_lock, flags); return 0; } @@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) * We are going to steal the lock and a waiter was * enqueued on the pending owners pi_waiters queue. So * we have to enqueue this waiter into - * current->pi_waiters list. This covers the case, - * where current is boosted because it holds another + * task->pi_waiters list. This covers the case, + * where task is boosted because it holds another * lock and gets unboosted because the booster is * interrupted, so we would delay a waiter with higher - * priority as current->normal_prio. + * priority as task->normal_prio. * * Note: in the rare case of a SCHED_OTHER task changing * its priority and thus stealing the lock, next->task - * might be current: + * might be task: */ - if (likely(next->task != current)) { - spin_lock_irqsave(¤t->pi_lock, flags); - plist_add(&next->pi_list_entry, ¤t->pi_waiters); - __rt_mutex_adjust_prio(current); - spin_unlock_irqrestore(¤t->pi_lock, flags); + if (likely(next->task != task)) { + spin_lock_irqsave(&task->pi_lock, flags); + plist_add(&next->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + spin_unlock_irqrestore(&task->pi_lock, flags); } return 1; } @@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) return 0; /* We got the lock. */ @@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, + struct task_struct *task, int detect_deadlock) { struct task_struct *owner = rt_mutex_owner(lock); @@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0, res; - spin_lock_irqsave(¤t->pi_lock, flags); - __rt_mutex_adjust_prio(current); - waiter->task = current; + spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + waiter->task = task; waiter->lock = lock; - plist_node_init(&waiter->list_entry, current->prio); - plist_node_init(&waiter->pi_list_entry, current->prio); + plist_node_init(&waiter->list_entry, task->prio); + plist_node_init(&waiter->pi_list_entry, task->prio); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) top_waiter = rt_mutex_top_waiter(lock); plist_add(&waiter->list_entry, &lock->wait_list); - current->pi_blocked_on = waiter; + task->pi_blocked_on = waiter; - spin_unlock_irqrestore(¤t->pi_lock, flags); + spin_unlock_irqrestore(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { spin_lock_irqsave(&owner->pi_lock, flags); @@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - current); + task); spin_lock(&lock->wait_lock); @@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task) rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } -/* - * Slow path lock function: +/** + * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop + * @lock: the rt_mutex to take + * @state: the state the task should block in (TASK_INTERRUPTIBLE + * or TASK_UNINTERRUPTIBLE) + * @timeout: the pre-initialized and started timer, or NULL for none + * @waiter: the pre-initialized rt_mutex_waiter + * @detect_deadlock: passed to task_blocks_on_rt_mutex + * + * lock->wait_lock must be held by the caller. */ static int __sched -rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock) +__rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + struct rt_mutex_waiter *waiter, + int detect_deadlock) { - struct rt_mutex_waiter waiter; int ret = 0; - debug_rt_mutex_init_waiter(&waiter); - waiter.task = NULL; - - spin_lock(&lock->wait_lock); - - /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock)) { - spin_unlock(&lock->wait_lock); - return 0; - } - - set_current_state(state); - - /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) { - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } - for (;;) { /* Try to acquire the lock: */ if (try_to_take_rt_mutex(lock)) @@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, } /* - * waiter.task is NULL the first time we come here and + * waiter->task is NULL the first time we come here and * when we have been woken up by the previous owner * but the lock got stolen by a higher prio task. */ - if (!waiter.task) { - ret = task_blocks_on_rt_mutex(lock, &waiter, + if (!waiter->task) { + ret = task_blocks_on_rt_mutex(lock, waiter, current, detect_deadlock); /* * If we got woken up by the owner then start loop * all over without going into schedule to try * to get the lock now: */ - if (unlikely(!waiter.task)) { + if (unlikely(!waiter->task)) { /* * Reset the return value. We might * have returned with -EDEADLK and the @@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, spin_unlock(&lock->wait_lock); - debug_rt_mutex_print_deadlock(&waiter); + debug_rt_mutex_print_deadlock(waiter); - if (waiter.task) + if (waiter->task) schedule_rt_mutex(lock); spin_lock(&lock->wait_lock); set_current_state(state); } + return ret; +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock) +{ + struct rt_mutex_waiter waiter; + int ret = 0; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + spin_lock(&lock->wait_lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock)) { + spin_unlock(&lock->wait_lock); + return 0; + } + + set_current_state(state); + + /* Setup the timer, when timeout != NULL */ + if (unlikely(timeout)) { + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } + + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, + detect_deadlock); + set_current_state(TASK_RUNNING); if (unlikely(waiter.task)) @@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); /** - * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller + * rt_mutex_timed_lock - lock a rt_mutex interruptible + * the timeout structure is provided + * by the caller * * @lock: the rt_mutex to be locked * @timeout: timeout structure or NULL (no timeout) @@ -875,7 +902,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); * Returns: * 0 on success * -EINTR when interrupted by a signal - * -ETIMEOUT when the timeout expired + * -ETIMEDOUT when the timeout expired * -EDEADLK when the lock would deadlock (when deadlock detection is on) */ int @@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) } EXPORT_SYMBOL_GPL(rt_mutex_unlock); -/*** +/** * rt_mutex_destroy - mark a mutex unusable * @lock: the mutex to be destroyed * @@ -986,6 +1013,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, } /** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for FUTEX_REQUEUE_PI support. + */ +int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, int detect_deadlock) +{ + int ret; + + spin_lock(&lock->wait_lock); + + mark_rt_mutex_waiters(lock); + + if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { + /* We got the lock for task. */ + debug_rt_mutex_lock(lock); + + rt_mutex_set_owner(lock, task, 0); + + rt_mutex_deadlock_account_lock(lock, task); + return 1; + } + + ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + + + if (ret && !waiter->task) { + /* + * Reset the return value. We might have + * returned with -EDEADLK and the owner + * released the lock while we were walking the + * pi chain. Let the waiter sort it out. + */ + ret = 0; + } + spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(waiter); + + return ret; +} + +/** * rt_mutex_next_owner - return the next owner of the lock * * @lock: the rt lock query @@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) return rt_mutex_top_waiter(lock)->task; } + +/** + * rt_mutex_finish_proxy_lock() - Complete lock acquisition + * @lock: the rt_mutex we were woken on + * @to: the timeout, null if none. hrtimer should already have + * been started. + * @waiter: the pre-initialized rt_mutex_waiter + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Complete the lock acquisition started our behalf by another thread. + * + * Returns: + * 0 - success + * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK + * + * Special API call for PI-futex requeue support + */ +int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock) +{ + int ret; + + spin_lock(&lock->wait_lock); + + set_current_state(TASK_INTERRUPTIBLE); + + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, + detect_deadlock); + + set_current_state(TASK_RUNNING); + + if (unlikely(waiter->task)) + remove_waiter(lock, waiter); + + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock(&lock->wait_lock); + + /* + * Readjust priority, when we did not get the lock. We might have been + * the pending owner and boosted. Since we did not take the lock, the + * PI boost has to go. + */ + if (unlikely(ret)) + rt_mutex_adjust_prio(current); + + return ret; +} diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index e124bf5..97a2f81 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); +extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + int detect_deadlock); +extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" diff --git a/kernel/sched.c b/kernel/sched.c index 26efa47..8ec9d13 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,6 +39,7 @@ #include <linux/completion.h> #include <linux/kernel_stat.h> #include <linux/debug_locks.h> +#include <linux/perf_counter.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/profile.h> @@ -68,17 +69,18 @@ #include <linux/pagemap.h> #include <linux/hrtimer.h> #include <linux/tick.h> -#include <linux/bootmem.h> #include <linux/debugfs.h> #include <linux/ctype.h> #include <linux/ftrace.h> -#include <trace/sched.h> #include <asm/tlb.h> #include <asm/irq_regs.h> #include "sched_cpupri.h" +#define CREATE_TRACE_POINTS +#include <trace/events/sched.h> + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -118,12 +120,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -DEFINE_TRACE(sched_wait_task); -DEFINE_TRACE(sched_wakeup); -DEFINE_TRACE(sched_wakeup_new); -DEFINE_TRACE(sched_switch); -DEFINE_TRACE(sched_migrate_task); - #ifdef CONFIG_SMP static void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -584,6 +580,7 @@ struct rq { struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; + u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -630,6 +627,10 @@ struct rq { struct list_head migration_queue; #endif + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; @@ -692,7 +693,7 @@ static inline int cpu_of(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -static inline void update_rq_clock(struct rq *rq) +inline void update_rq_clock(struct rq *rq) { rq->clock = sched_clock_cpu(cpu_of(rq)); } @@ -1728,6 +1729,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif +static void calc_load_account_active(struct rq *this_rq); + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -1958,7 +1961,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; - trace_sched_migrate_task(p, task_cpu(p), new_cpu); + trace_sched_migrate_task(p, new_cpu); #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) @@ -1967,12 +1970,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.sleep_start -= clock_offset; if (p->se.block_start) p->se.block_start -= clock_offset; +#endif if (old_cpu != new_cpu) { - schedstat_inc(p, se.nr_migrations); + p->se.nr_migrations++; + new_rq->nr_migrations_in++; +#ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); - } #endif + perf_counter_task_migration(p, new_cpu); + } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; @@ -2015,6 +2022,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) } /* + * wait_task_context_switch - wait for a thread to complete at least one + * context switch. + * + * @p must not be current. + */ +void wait_task_context_switch(struct task_struct *p) +{ + unsigned long nvcsw, nivcsw, flags; + int running; + struct rq *rq; + + nvcsw = p->nvcsw; + nivcsw = p->nivcsw; + for (;;) { + /* + * The runqueue is assigned before the actual context + * switch. We need to take the runqueue lock. + * + * We could check initially without the lock but it is + * very likely that we need to take the lock in every + * iteration. + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); + task_rq_unlock(rq, &flags); + + if (likely(!running)) + break; + /* + * The switch count is incremented before the actual + * context switch. We thus wait for two switches to be + * sure at least one completed. + */ + if ((p->nvcsw - nvcsw) > 1) + break; + if ((p->nivcsw - nivcsw) > 1) + break; + + cpu_relax(); + } +} + +/* * wait_task_inactive - wait for a thread to unschedule. * * If @match_state is nonzero, it's the @p->state value just checked and @@ -2142,6 +2192,7 @@ void kick_process(struct task_struct *p) smp_send_reschedule(cpu); preempt_enable(); } +EXPORT_SYMBOL_GPL(kick_process); /* * Return a low guess at the load of a migration-source cpu weighted @@ -2324,6 +2375,27 @@ static int sched_balance_self(int cpu, int flag) #endif /* CONFIG_SMP */ +/** + * task_oncpu_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + */ +void task_oncpu_function_call(struct task_struct *p, + void (*func) (void *info), void *info) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if (task_curr(p)) + smp_call_function_single(cpu, func, info, 1); + preempt_enable(); +} + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2458,6 +2530,17 @@ out: return success; } +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ int wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_ALL, 0); @@ -2480,6 +2563,7 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; p->se.start_runtime = 0; @@ -2710,6 +2794,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); + perf_counter_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP if (post_schedule) @@ -2766,7 +2851,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * combine the page table reload and the switch backend into * one hypercall. */ - arch_enter_lazy_cpu_mode(); + arch_start_context_switch(prev); if (unlikely(!mm)) { next->active_mm = oldmm; @@ -2856,19 +2941,81 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_active(void) +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) { - unsigned long i, running = 0, uninterruptible = 0; + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; - } +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + return load >> FSHIFT; +} - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(void) +{ + unsigned long upd = calc_load_update + 10; + long active; - return running + uninterruptible; + if (time_before(jiffies, upd)) + return; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; +} + +/* + * Either called from update_cpu_load() or from a cpu going idle + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long nr_active, delta; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + atomic_long_add(delta, &calc_load_tasks); + } +} + +/* + * Externally visible per-cpu scheduler statistics: + * cpu_nr_migrations(cpu) - number of migrations into that cpu + */ +u64 cpu_nr_migrations(int cpu) +{ + return cpu_rq(cpu)->nr_migrations_in; } /* @@ -2899,6 +3046,11 @@ static void update_cpu_load(struct rq *this_rq) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } + + if (time_after_eq(jiffies, this_rq->calc_load_update)) { + this_rq->calc_load_update += LOAD_FREQ; + calc_load_account_active(this_rq); + } } #ifdef CONFIG_SMP @@ -4240,10 +4392,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) static struct { atomic_t load_balancer; cpumask_var_t cpu_mask; + cpumask_var_t ilb_grp_nohz_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), }; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu: The cpu whose lowest level of sched domain is to + * be returned. + * @flag: The flag to check for the lowest sched_domain + * for the given cpu. + * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) + if (sd && (sd->flags & flag)) + break; + + return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu: The cpu whose domains we're iterating over. + * @sd: variable holding the value of the power_savings_sd + * for cpu. + * @flag: The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. + */ +#define for_each_flag_domain(cpu, sd, flag) \ + for (sd = lowest_flag_domain(cpu, flag); \ + (sd && (sd->flags & flag)); sd = sd->parent) + +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group: group to be checked for semi-idleness + * + * Returns: 1 if the group is semi-idle. 0 otherwise. + * + * We define a sched_group to be semi idle if it has atleast one idle-CPU + * and atleast one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not. + */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ + cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has atleast one busy cpu + * and atleast one idle cpu. + */ + if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + return 0; + + if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +} +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu: The cpu which is nominating a new idle_load_balancer. + * + * Returns: Returns the id of the idle load balancer if it exists, + * Else, returns >= nr_cpu_ids. + * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job. + */ +static int find_new_ilb(int cpu) +{ + struct sched_domain *sd; + struct sched_group *ilb_group; + + /* + * Have idle load balancer selection from semi-idle packages only + * when power-aware load balancing is enabled + */ + if (!(sched_smt_power_savings || sched_mc_power_savings)) + goto out_done; + + /* + * Optimize for the case when we have no idle CPUs or only one + * idle CPU. Don't walk the sched_domain hierarchy in such cases + */ + if (cpumask_weight(nohz.cpu_mask) < 2) + goto out_done; + + for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { + ilb_group = sd->groups; + + do { + if (is_semi_idle_group(ilb_group)) + return cpumask_first(nohz.ilb_grp_nohz_mask); + + ilb_group = ilb_group->next; + + } while (ilb_group != sd->groups); + } + +out_done: + return cpumask_first(nohz.cpu_mask); +} +#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ + return cpumask_first(nohz.cpu_mask); +} +#endif + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle @@ -4298,8 +4566,24 @@ int select_nohz_load_balancer(int stop_tick) /* make me the ilb owner */ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) + } else if (atomic_read(&nohz.load_balancer) == cpu) { + int new_ilb; + + if (!(sched_smt_power_savings || + sched_mc_power_savings)) + return 1; + /* + * Check to see if there is a more power-efficient + * ilb. + */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { + atomic_set(&nohz.load_balancer, -1); + resched_cpu(new_ilb); + return 0; + } return 1; + } } else { if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; @@ -4468,15 +4752,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) } if (atomic_read(&nohz.load_balancer) == -1) { - /* - * simple selection for now: Nominate the - * first cpu in the nohz list to be the next - * ilb owner. - * - * TBD: Traverse the sched domains and nominate - * the nearest cpu in the nohz.cpu_mask. - */ - int ilb = cpumask_first(nohz.cpu_mask); + int ilb = find_new_ilb(cpu); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -4840,6 +5116,8 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); + perf_counter_task_tick(curr, cpu); + #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); @@ -5007,13 +5285,15 @@ pick_next_task(struct rq *rq) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched __schedule(void) +asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; +need_resched: + preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -5053,6 +5333,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); + perf_counter_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; @@ -5070,15 +5351,9 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -} -asmlinkage void __sched schedule(void) -{ -need_resched: - preempt_disable(); - __schedule(); preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (need_resched()) goto need_resched; } EXPORT_SYMBOL(schedule); @@ -5221,7 +5496,7 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { wait_queue_t *curr, *next; @@ -5241,6 +5516,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5279,6 +5557,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5315,6 +5596,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * awakened in the same order in which they were queued. * * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete(struct completion *x) { @@ -5332,6 +5616,9 @@ EXPORT_SYMBOL(complete); * @x: holds the state of this particular completion * * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete_all(struct completion *x) { @@ -6490,8 +6777,9 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent)); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent), + (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); } @@ -6970,6 +7258,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu) } } + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. + */ +static void calc_global_load_remove(struct rq *rq) +{ + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +} #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -7204,6 +7500,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); + rq->calc_load_update = calc_load_update; + rq->calc_load_active = 0; if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7243,7 +7541,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); - + calc_global_load_remove(rq); /* * No need to migrate the tasks: it was best-effort if * they didn't take sched_hotcpu_mutex. Just wake up @@ -7279,8 +7577,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -/* Register at highest priority so that task migration (migrate_all_tasks) - * happens before everything else. +/* + * Register at high priority so that task migration (migrate_all_tasks) + * happens before everything else. This has to be lower priority than + * the notifier in the perf_counter subsystem, though. */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, @@ -7525,24 +7825,21 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) { + gfp_t gfp = GFP_KERNEL; + memset(rd, 0, sizeof(*rd)); - if (bootmem) { - alloc_bootmem_cpumask_var(&def_root_domain.span); - alloc_bootmem_cpumask_var(&def_root_domain.online); - alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); - cpupri_init(&rd->cpupri, true); - return 0; - } + if (bootmem) + gfp = GFP_NOWAIT; - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->span, gfp)) goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->online, gfp)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->rto_mask, gfp)) goto free_online; - if (cpupri_init(&rd->cpupri, false) != 0) + if (cpupri_init(&rd->cpupri, bootmem) != 0) goto free_rto_mask; return 0; @@ -7753,8 +8050,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * The cpus mask in sched_group and sched_domain hangs off the end. - * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space - * for nr_cpu_ids < CONFIG_NR_CPUS. + * + * ( See the the comments in include/linux/sched.h:struct sched_group + * and struct sched_domain. ) */ struct static_sched_group { struct sched_group sg; @@ -7875,7 +8173,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) struct sched_domain *sd; sd = &per_cpu(phys_domains, j).sd; - if (j != cpumask_first(sched_group_cpus(sd->groups))) { + if (j != group_first_cpu(sd->groups)) { /* * Only add "power" once for each * physical package. @@ -7953,7 +8251,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != cpumask_first(sched_group_cpus(sd->groups))) + if (cpu != group_first_cpu(sd->groups)) return; child = sd->child; @@ -8865,7 +9163,7 @@ void __init sched_init(void) * we use alloc_bootmem(). */ if (alloc_size) { - ptr = (unsigned long)alloc_bootmem(alloc_size); + ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.se = (struct sched_entity **)ptr; @@ -8938,6 +9236,8 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->nr_running = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -8958,7 +9258,7 @@ void __init sched_init(void) * 1024) and two child groups A0 and A1 (of weight 1024 each), * then A0's share of the cpu resource is: * - * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% * * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). @@ -9045,20 +9345,26 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + + calc_load_update = jiffies + LOAD_FREQ; + /* * During early bootup we pretend to be a normal task: */ current->sched_class = &fair_sched_class; /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - alloc_bootmem_cpumask_var(&nohz_cpu_mask); + alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - alloc_bootmem_cpumask_var(&nohz.cpu_mask); + alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - alloc_bootmem_cpumask_var(&cpu_isolated_map); + alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ + perf_counter_init(); + scheduler_running = 1; } @@ -9800,6 +10106,13 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; + /* + * There's always some RT tasks in the root group + * -- migration, kstopmachine etc.. + */ + if (sysctl_sched_rt_runtime == 0) + return -EBUSY; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 819f17a..e1d16c9 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -38,7 +38,8 @@ */ unsigned long long __attribute__((weak)) sched_clock(void) { - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); + return (unsigned long long)(jiffies - INITIAL_JIFFIES) + * (NSEC_PER_SEC / HZ); } static __read_mostly int sched_clock_running; diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index cdd3c89..7deffc9 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -154,8 +154,12 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) */ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) { + gfp_t gfp = GFP_KERNEL; int i; + if (bootmem) + gfp = GFP_NOWAIT; + memset(cp, 0, sizeof(*cp)); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { @@ -163,9 +167,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) spin_lock_init(&vec->lock); vec->count = 0; - if (bootmem) - alloc_bootmem_cpumask_var(&vec->mask); - else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&vec->mask, gfp)) goto cleanup; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3816f21..5f9650e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) find_matching_se(&se, &pse); - while (se) { - BUG_ON(!pse); + BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) { - resched_task(curr); - break; - } - - se = parent_entity(se); - pse = parent_entity(pse); - } + if (wakeup_preempt_entity(se, pse) == 1) + resched_task(curr); } static struct task_struct *pick_next_task_fair(struct rq *rq) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 8a21a2e..499672c 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - + /* adjust the active tasks as we might go into a long sleep */ + calc_load_account_active(rq); return rq->idle; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f2c66f8..9bf0d2a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1591,7 +1591,7 @@ static inline void init_sched_rt_class(void) unsigned int i; for_each_possible_cpu(i) - alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), + zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_SMP */ diff --git a/kernel/signal.c b/kernel/signal.c index d803473..809a228 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -27,7 +27,7 @@ #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> -#include <trace/sched.h> +#include <trace/events/sched.h> #include <asm/param.h> #include <asm/uaccess.h> @@ -41,8 +41,6 @@ static struct kmem_cache *sigqueue_cachep; -DEFINE_TRACE(sched_signal_send); - static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; @@ -249,14 +247,19 @@ void flush_sigqueue(struct sigpending *queue) /* * Flush all pending signals for a task. */ +void __flush_signals(struct task_struct *t) +{ + clear_tsk_thread_flag(t, TIF_SIGPENDING); + flush_sigqueue(&t->pending); + flush_sigqueue(&t->signal->shared_pending); +} + void flush_signals(struct task_struct *t) { unsigned long flags; spin_lock_irqsave(&t->sighand->siglock, flags); - clear_tsk_thread_flag(t, TIF_SIGPENDING); - flush_sigqueue(&t->pending); - flush_sigqueue(&t->signal->shared_pending); + __flush_signals(t); spin_unlock_irqrestore(&t->sighand->siglock, flags); } @@ -2278,24 +2281,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) return kill_something_info(sig, &info, pid); } -static int do_tkill(pid_t tgid, pid_t pid, int sig) +static int +do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) { - int error; - struct siginfo info; struct task_struct *p; unsigned long flags; - - error = -ESRCH; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid = task_tgid_vnr(current); - info.si_uid = current_uid(); + int error = -ESRCH; rcu_read_lock(); p = find_task_by_vpid(pid); if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { - error = check_kill_permission(sig, &info, p); + error = check_kill_permission(sig, info, p); /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. @@ -2305,7 +2301,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) * signal is private anyway. */ if (!error && sig && lock_task_sighand(p, &flags)) { - error = specific_send_sig_info(sig, &info, p); + error = specific_send_sig_info(sig, info, p); unlock_task_sighand(p, &flags); } } @@ -2314,6 +2310,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) return error; } +static int do_tkill(pid_t tgid, pid_t pid, int sig) +{ + struct siginfo info; + + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_TKILL; + info.si_pid = task_tgid_vnr(current); + info.si_uid = current_uid(); + + return do_send_specific(tgid, pid, sig, &info); +} + /** * sys_tgkill - send signal to one specific thread * @tgid: the thread group ID of the thread @@ -2363,6 +2372,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, return kill_proc_info(sig, &info, pid); } +long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) +{ + /* This is only valid for single tasks */ + if (pid <= 0 || tgid <= 0) + return -EINVAL; + + /* Not even root can pretend to send signals from the kernel. + Nor can they impersonate a kill(), which adds source info. */ + if (info->si_code >= 0) + return -EPERM; + info->si_signo = sig; + + return do_send_specific(tgid, pid, sig, info); +} + +SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, + siginfo_t __user *, uinfo) +{ + siginfo_t info; + + if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) + return -EFAULT; + + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *t = current; diff --git a/kernel/slow-work.c b/kernel/slow-work.c index b28d1913..521ed20 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -372,8 +372,8 @@ static int slow_work_thread(void *_data) vsmax *= atomic_read(&slow_work_thread_count); vsmax /= 100; - prepare_to_wait(&slow_work_thread_wq, &wait, - TASK_INTERRUPTIBLE); + prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, + TASK_INTERRUPTIBLE); if (!freezing(current) && !slow_work_threads_should_exit && !slow_work_available(vsmax) && diff --git a/kernel/smp.c b/kernel/smp.c index 858baac..ad63d85 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, + if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return NOTIFY_BAD; break; diff --git a/kernel/softirq.c b/kernel/softirq.c index b525dd3..258885a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,7 +24,9 @@ #include <linux/ftrace.h> #include <linux/smp.h> #include <linux/tick.h> -#include <trace/irq.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/irq.h> #include <asm/irq.h> /* @@ -186,9 +188,6 @@ EXPORT_SYMBOL(local_bh_enable_ip); */ #define MAX_SOFTIRQ_RESTART 10 -DEFINE_TRACE(softirq_entry); -DEFINE_TRACE(softirq_exit); - asmlinkage void __do_softirq(void) { struct softirq_action *h; @@ -828,7 +827,7 @@ int __init __weak arch_early_irq_init(void) return 0; } -int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) +int __weak arch_init_chip_data(struct irq_desc *desc, int node) { return 0; } diff --git a/kernel/sys.c b/kernel/sys.c index e7998cf..438d99a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,6 +14,7 @@ #include <linux/prctl.h> #include <linux/highuid.h> #include <linux/fs.h> +#include <linux/perf_counter.h> #include <linux/resource.h> #include <linux/kernel.h> #include <linux/kexec.h> @@ -1793,6 +1794,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; + case PR_TASK_PERF_COUNTERS_DISABLE: + error = perf_counter_task_disable(); + break; + case PR_TASK_PERF_COUNTERS_ENABLE: + error = perf_counter_task_enable(); + break; case PR_GET_TIMERSLACK: error = current->timer_slack_ns; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 27dad29..68320f6 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); + +/* performance counters: */ +cond_syscall(sys_perf_counter_open); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ea78fa1..ce664f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@ #include <linux/reboot.h> #include <linux/ftrace.h> #include <linux/slow-work.h> +#include <linux/perf_counter.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -101,7 +102,6 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; -static int one_thousand = 1000; /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -115,6 +115,7 @@ static int ngroups_max = NGROUPS_MAX; #ifdef CONFIG_MODULES extern char modprobe_path[]; +extern int modules_disabled; #endif #ifdef CONFIG_CHR_DEV_SG extern int sg_big_buff; @@ -535,6 +536,17 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "modules_disabled", + .data = &modules_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = &proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, + }, #endif #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) { @@ -732,6 +744,14 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, .procname = "kstack_depth_to_print", .data = &kstack_depth_to_print, .maxlen = sizeof(int), @@ -913,6 +933,32 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif +#ifdef CONFIG_PERF_COUNTERS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_paranoid", + .data = &sysctl_perf_counter_paranoid, + .maxlen = sizeof(sysctl_perf_counter_paranoid), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_mlock_kb", + .data = &sysctl_perf_counter_mlock, + .maxlen = sizeof(sysctl_perf_counter_mlock), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_max_sample_rate", + .data = &sysctl_perf_counter_sample_rate, + .maxlen = sizeof(sysctl_perf_counter_sample_rate), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -1034,28 +1080,6 @@ static struct ctl_table vm_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_pdflush_threads_min", - .data = &nr_pdflush_threads_min, - .maxlen = sizeof nr_pdflush_threads_min, - .mode = 0644 /* read-write */, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &one, - .extra2 = &nr_pdflush_threads_max, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_pdflush_threads_max", - .data = &nr_pdflush_threads_max, - .maxlen = sizeof nr_pdflush_threads_max, - .mode = 0644 /* read-write */, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &nr_pdflush_threads_min, - .extra2 = &one_thousand, - }, - { .ctl_name = VM_SWAPPINESS, .procname = "swappiness", .data = &vm_swappiness, @@ -1248,7 +1272,6 @@ static struct ctl_table vm_table[] = { .strategy = &sysctl_jiffies, }, #endif -#ifdef CONFIG_SECURITY { .ctl_name = CTL_UNNUMBERED, .procname = "mmap_min_addr", @@ -1257,7 +1280,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, -#endif #ifdef CONFIG_NUMA { .ctl_name = CTL_UNNUMBERED, diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ecfd7b5..80189f6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -402,9 +402,6 @@ int clocksource_register(struct clocksource *c) unsigned long flags; int ret; - /* save mult_orig on registration */ - c->mult_orig = c->mult; - spin_lock_irqsave(&clocksource_lock, flags); ret = clocksource_enqueue(c); if (!ret) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 687dff4..e8c77d9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,7 +22,7 @@ /* * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. + * playing with xtime. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); @@ -77,6 +77,10 @@ static void clocksource_forward_now(void) clock->cycle_last = cycle_now; nsec = cyc2ns(clock, cycle_delta); + + /* If arch requires, add in gettimeoffset() */ + nsec += arch_gettimeoffset(); + timespec_add_ns(&xtime, nsec); nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; @@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts) /* convert to nanoseconds: */ nsecs = cyc2ns(clock, cycle_delta); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); + } while (read_seqretry(&xtime_lock, seq)); timespec_add_ns(ts, nsecs); diff --git a/kernel/timer.c b/kernel/timer.c index cffffad..faf2db8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,6 +37,7 @@ #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> +#include <linux/perf_counter.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -756,6 +757,7 @@ void add_timer_on(struct timer_list *timer, int cpu) wake_up_idle_cpu(cpu); spin_unlock_irqrestore(&base->lock, flags); } +EXPORT_SYMBOL_GPL(add_timer_on); /** * del_timer - deactive a timer. @@ -1123,53 +1125,14 @@ void update_process_times(int user_tick) } /* - * Nr of active tasks - counted in fixed-point numbers - */ -static unsigned long count_active_tasks(void) -{ - return nr_active() * FIXED_1; -} - -/* - * Hmm.. Changed this, as the GNU make sources (load.c) seems to - * imply that avenrun[] is the standard name for this kind of thing. - * Nothing else seems to be standardized: the fractional size etc - * all seem to differ on different machines. - * - * Requires xtime_lock to access. - */ -unsigned long avenrun[3]; - -EXPORT_SYMBOL(avenrun); - -/* - * calc_load - given tick count, update the avenrun load estimates. - * This is called while holding a write_lock on xtime_lock. - */ -static inline void calc_load(unsigned long ticks) -{ - unsigned long active_tasks; /* fixed-point */ - static int count = LOAD_FREQ; - - count -= ticks; - if (unlikely(count < 0)) { - active_tasks = count_active_tasks(); - do { - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); - count += LOAD_FREQ; - } while (count < 0); - } -} - -/* * This function runs timers and the timer-tq in bottom half context. */ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); + perf_counter_do_pending(); + hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) @@ -1187,16 +1150,6 @@ void run_local_timers(void) } /* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! - */ -static inline void update_times(unsigned long ticks) -{ - update_wall_time(); - calc_load(ticks); -} - -/* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. * jiffies is defined in the linker script... @@ -1205,7 +1158,8 @@ static inline void update_times(unsigned long ticks) void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_times(ticks); + update_wall_time(); + calc_global_load(); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1406,37 +1360,17 @@ int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; - unsigned long seq; + struct timespec tp; memset(info, 0, sizeof(struct sysinfo)); - do { - struct timespec tp; - seq = read_seqbegin(&xtime_lock); - - /* - * This is annoying. The below is the same thing - * posix_get_clock_monotonic() does, but it wants to - * take the lock which we want to cover the loads stuff - * too. - */ - - getnstimeofday(&tp); - tp.tv_sec += wall_to_monotonic.tv_sec; - tp.tv_nsec += wall_to_monotonic.tv_nsec; - monotonic_to_bootbased(&tp); - if (tp.tv_nsec - NSEC_PER_SEC >= 0) { - tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; - tp.tv_sec++; - } - info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + ktime_get_ts(&tp); + monotonic_to_bootbased(&tp); + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); - info->procs = nr_threads; - } while (read_seqretry(&xtime_lock, seq)); + info->procs = nr_threads; si_meminfo(info); si_swapinfo(info); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 417d198..4a13e5a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -48,6 +48,21 @@ config FTRACE_NMI_ENTER depends on HAVE_FTRACE_NMI_ENTER default y +config EVENT_TRACING + select CONTEXT_SWITCH_TRACER + bool + +config CONTEXT_SWITCH_TRACER + select MARKERS + bool + +# All tracer options should select GENERIC_TRACER. For those options that are +# enabled by all tracers (context switch and event tracer) they select TRACING. +# This allows those options to appear when no other tracer is selected. But the +# options do not appear when something else selects it. We need the two options +# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the +# hidding of the automatic options options. + config TRACING bool select DEBUG_FS @@ -56,6 +71,11 @@ config TRACING select TRACEPOINTS select NOP_TRACER select BINARY_PRINTF + select EVENT_TRACING + +config GENERIC_TRACER + bool + select TRACING # # Minimum requirements an architecture has to meet for us to @@ -73,14 +93,20 @@ config TRACING_SUPPORT if TRACING_SUPPORT -menu "Tracers" +menuconfig FTRACE + bool "Tracers" + default y if DEBUG_KERNEL + help + Enable the kernel tracing infrastructure. + +if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER select FRAME_POINTER select KALLSYMS - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER help Enable the kernel to trace every kernel function. This is done @@ -104,13 +130,14 @@ config FUNCTION_GRAPH_TRACER the return value. This is done by setting the current return address on the current task structure into a stack of calls. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME select TRACE_IRQFLAGS - select TRACING + select GENERIC_TRACER select TRACER_MAX_TRACE help This option measures the time spent in irqs-off critical @@ -131,7 +158,7 @@ config PREEMPT_TRACER default n depends on GENERIC_TIME depends on PREEMPT - select TRACING + select GENERIC_TRACER select TRACER_MAX_TRACE help This option measures the time spent in preemption off critical @@ -150,7 +177,7 @@ config PREEMPT_TRACER config SYSPROF_TRACER bool "Sysprof Tracer" depends on X86 - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER help This tracer provides the trace needed by the 'Sysprof' userspace @@ -158,40 +185,33 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE help This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. -config CONTEXT_SWITCH_TRACER - bool "Trace process context switches" - select TRACING - select MARKERS - help - This tracer gets called from the context switch and records - all switching of tasks. - -config EVENT_TRACER - bool "Trace various events in the kernel" +config ENABLE_DEFAULT_TRACERS + bool "Trace process context switches and events" + depends on !GENERIC_TRACER select TRACING help This tracer hooks to various trace points in the kernel allowing the user to pick and choose which trace point they - want to trace. + want to trace. It also includes the sched_switch tracer plugin. config FTRACE_SYSCALLS bool "Trace syscalls" depends on HAVE_FTRACE_SYSCALLS - select TRACING + select GENERIC_TRACER select KALLSYMS help Basic tracer to catch the syscall entry and exit events. config BOOT_TRACER bool "Trace boot initcalls" - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER help This tracer helps developers to optimize boot times: it records @@ -207,8 +227,36 @@ config BOOT_TRACER to enable this on bootup. config TRACE_BRANCH_PROFILING + bool + select GENERIC_TRACER + +choice + prompt "Branch Profiling" + default BRANCH_PROFILE_NONE + help + The branch profiling is a software profiler. It will add hooks + into the C conditionals to test which path a branch takes. + + The likely/unlikely profiler only looks at the conditions that + are annotated with a likely or unlikely macro. + + The "all branch" profiler will profile every if statement in the + kernel. This profiler will also enable the likely/unlikely + profiler as well. + + Either of the above profilers add a bit of overhead to the system. + If unsure choose "No branch profiling". + +config BRANCH_PROFILE_NONE + bool "No branch profiling" + help + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. + +config PROFILE_ANNOTATED_BRANCHES bool "Trace likely/unlikely profiler" - select TRACING + select TRACE_BRANCH_PROFILING help This tracer profiles all the the likely and unlikely macros in the kernel. It will display the results in: @@ -218,11 +266,9 @@ config TRACE_BRANCH_PROFILING Note: this will add a significant overhead, only turn this on if you need to profile the system's use of these macros. - Say N if unsure. - config PROFILE_ALL_BRANCHES bool "Profile all if conditionals" - depends on TRACE_BRANCH_PROFILING + select TRACE_BRANCH_PROFILING help This tracer profiles all branch conditions. Every if () taken in the kernel is recorded whether it hit or miss. @@ -230,11 +276,12 @@ config PROFILE_ALL_BRANCHES /debugfs/tracing/profile_branch + This option also enables the likely/unlikely profiler. + This configuration, when enabled, will impose a great overhead on the system. This should only be enabled when the system is to be analyzed - - Say N if unsure. +endchoice config TRACING_BRANCHES bool @@ -261,7 +308,7 @@ config BRANCH_TRACER config POWER_TRACER bool "Trace power consumption behavior" depends on X86 - select TRACING + select GENERIC_TRACER help This tracer helps developers to analyze and optimize the kernels power management decisions, specifically the C-state and P-state @@ -295,14 +342,14 @@ config STACK_TRACER config HW_BRANCH_TRACER depends on HAVE_HW_BRANCH_TRACER bool "Trace hw branches" - select TRACING + select GENERIC_TRACER help This tracer records all branches on the system in a circular buffer giving access to the last N branches for each cpu. config KMEMTRACE bool "Trace SLAB allocations" - select TRACING + select GENERIC_TRACER help kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected @@ -322,7 +369,7 @@ config KMEMTRACE config WORKQUEUE_TRACER bool "Trace workqueues" - select TRACING + select GENERIC_TRACER help The workqueue tracer provides some statistical informations about each cpu workqueue thread such as the number of the @@ -338,7 +385,7 @@ config BLK_DEV_IO_TRACE select RELAY select DEBUG_FS select TRACEPOINTS - select TRACING + select GENERIC_TRACER select STACKTRACE help Say Y here if you want to be able to trace the block layer actions @@ -375,6 +422,20 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config FUNCTION_PROFILER + bool "Kernel function profiler" + depends on FUNCTION_TRACER + default n + help + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A file in the trace_stats + directory called functions, that show the list of functions that + have been hit and their counters. + + If in doubt, say N + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE @@ -385,7 +446,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING + depends on GENERIC_TRACER select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup @@ -396,7 +457,7 @@ config FTRACE_STARTUP_TEST config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI - select TRACING + select GENERIC_TRACER help Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. It is called from the ioremap @@ -416,7 +477,23 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. -endmenu +config RING_BUFFER_BENCHMARK + tristate "Ring buffer benchmark stress tester" + depends on RING_BUFFER + help + This option creates a test to stress the ring buffer and bench mark it. + It creates its own ring buffer such that it will not interfer with + any other users of the ring buffer (such as ftrace). It then creates + a producer and consumer that will run for 10 seconds and sleep for + 10 seconds. Each interval it will print out the number of events + it recorded and give a rough estimate of how long each iteration took. + + It does not disable interrupts or raise its priority, so it may be + affected by processes that are running. + + If unsure, say N + +endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2630f51..844164d 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -15,11 +15,17 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +# +# Make the trace clocks available generally: it's infrastructure +# relied on by ptrace for example: +# +obj-y += trace_clock.o + obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o +obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o obj-$(CONFIG_TRACING) += trace.o -obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o @@ -39,12 +45,14 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o -obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o -obj-$(CONFIG_EVENT_TRACER) += trace_events.o -obj-$(CONFIG_EVENT_TRACER) += events.o -obj-$(CONFIG_EVENT_TRACER) += trace_export.o +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +ifeq ($(CONFIG_BLOCK),y) +obj-$(CONFIG_EVENT_TRACING) += blktrace.o +endif +obj-$(CONFIG_EVENT_TRACING) += trace_events.o +obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o -obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 921ef5d..39af8af 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -23,10 +23,14 @@ #include <linux/mutex.h> #include <linux/debugfs.h> #include <linux/time.h> -#include <trace/block.h> #include <linux/uaccess.h> + +#include <trace/events/block.h> + #include "trace_output.h" +#ifdef CONFIG_BLK_DEV_IO_TRACE + static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; @@ -147,7 +151,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) return 1; - if (sector < bt->start_lba || sector > bt->end_lba) + if (sector && (sector < bt->start_lba || sector > bt->end_lba)) return 1; if (bt->pid && pid != bt->pid) return 1; @@ -192,7 +196,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(rw, DISCARD); pid = tsk->pid; - if (unlikely(act_log_check(bt, what, sector, pid))) + if (act_log_check(bt, what, sector, pid)) return; cpu = raw_smp_processor_id(); @@ -262,6 +266,7 @@ static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); + debugfs_remove(bt->dir); relay_close(bt->rchan); free_percpu(bt->sequence); free_percpu(bt->msg_data); @@ -403,11 +408,29 @@ static struct rchan_callbacks blk_relay_callbacks = { .remove_buf_file = blk_remove_buf_file_callback, }; +static void blk_trace_setup_lba(struct blk_trace *bt, + struct block_device *bdev) +{ + struct hd_struct *part = NULL; + + if (bdev) + part = bdev->bd_part; + + if (part) { + bt->start_lba = part->start_sect; + bt->end_lba = part->start_sect + part->nr_sects; + } else { + bt->start_lba = 0; + bt->end_lba = -1ULL; + } +} + /* * Setup everything required to start tracing */ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct blk_user_trace_setup *buts) + struct block_device *bdev, + struct blk_user_trace_setup *buts) { struct blk_trace *old_bt, *bt = NULL; struct dentry *dir = NULL; @@ -480,10 +503,13 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->act_mask) bt->act_mask = (u16) -1; - bt->start_lba = buts->start_lba; - bt->end_lba = buts->end_lba; - if (!bt->end_lba) - bt->end_lba = -1ULL; + blk_trace_setup_lba(bt, bdev); + + /* overwrite with user settings */ + if (buts->start_lba) + bt->start_lba = buts->start_lba; + if (buts->end_lba) + bt->end_lba = buts->end_lba; bt->pid = buts->pid; bt->trace_state = Blktrace_setup; @@ -505,6 +531,7 @@ err: } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; @@ -514,7 +541,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (ret) return -EFAULT; - ret = do_blk_trace_setup(q, name, dev, &buts); + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; @@ -582,7 +609,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, arg); + ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; case BLKTRACESTART: start = 1; @@ -642,12 +669,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (blk_pc_request(rq)) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, - rq->cmd_len, rq->cmd); + __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, + what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, - rw, what, rq->errors, 0, NULL); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, + what, rq->errors, 0, NULL); } } @@ -809,7 +836,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * @bio: the source bio * @dev: target device * @from: source sector - * @to: target sector * * Description: * Device mapper or raid target sometimes need to split a bio because @@ -817,7 +843,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * **/ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from, sector_t to) + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -825,12 +851,13 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, if (likely(!bt)) return; - r.device = cpu_to_be32(dev); - r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); - r.sector = cpu_to_be64(to); + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.sector_from = cpu_to_be64(from); - __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, + BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), + sizeof(r), &r); } /** @@ -854,11 +881,11 @@ void blk_add_driver_data(struct request_queue *q, return; if (blk_pc_request(rq)) - __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, - rq->errors, len, data); + __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, + BLK_TA_DRV_DATA, rq->errors, len, data); else - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, - 0, BLK_TA_DRV_DATA, rq->errors, len, data); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, + BLK_TA_DRV_DATA, rq->errors, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -971,6 +998,16 @@ static inline const void *pdu_start(const struct trace_entry *ent) return te_blk_io_trace(ent) + 1; } +static inline u32 t_action(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->action; +} + +static inline u32 t_bytes(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes; +} + static inline u32 t_sec(const struct trace_entry *ent) { return te_blk_io_trace(ent)->bytes >> 9; @@ -996,11 +1033,11 @@ static void get_pdu_remap(const struct trace_entry *ent, struct blk_io_trace_remap *r) { const struct blk_io_trace_remap *__r = pdu_start(ent); - __u64 sector = __r->sector; + __u64 sector_from = __r->sector_from; - r->device = be32_to_cpu(__r->device); r->device_from = be32_to_cpu(__r->device_from); - r->sector = be64_to_cpu(sector); + r->device_to = be32_to_cpu(__r->device_to); + r->sector_from = be64_to_cpu(sector_from); } typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); @@ -1031,36 +1068,98 @@ static int blk_log_action(struct trace_iterator *iter, const char *act) MAJOR(t->device), MINOR(t->device), act, rwbs); } +static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +{ + const unsigned char *pdu_buf; + int pdu_len; + int i, end, ret; + + pdu_buf = pdu_start(ent); + pdu_len = te_blk_io_trace(ent)->pdu_len; + + if (!pdu_len) + return 1; + + /* find the last zero that needs to be printed */ + for (end = pdu_len - 1; end >= 0; end--) + if (pdu_buf[end]) + break; + end++; + + if (!trace_seq_putc(s, '(')) + return 0; + + for (i = 0; i < pdu_len; i++) { + + ret = trace_seq_printf(s, "%s%02x", + i == 0 ? "" : " ", pdu_buf[i]); + if (!ret) + return ret; + + /* + * stop when the rest is just zeroes and indicate so + * with a ".." appended + */ + if (i == end && end != pdu_len - 1) + return trace_seq_puts(s, " ..) "); + } + + return trace_seq_puts(s, ") "); +} + static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%s]\n", - t_sector(ent), t_sec(ent), cmd); - return trace_seq_printf(s, "[%s]\n", cmd); + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = trace_seq_printf(s, "%u ", t_bytes(ent)); + if (!ret) + return 0; + ret = blk_log_dump_pdu(s, ent); + if (!ret) + return 0; + return trace_seq_printf(s, "[%s]\n", cmd); + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + return trace_seq_printf(s, "[%s]\n", cmd); + } } static int blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) { - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), - t_sec(ent), t_error(ent)); - return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = blk_log_dump_pdu(s, ent); + if (ret) + return trace_seq_printf(s, "[%d]\n", t_error(ent)); + return 0; + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%d]\n", + t_sector(ent), + t_sec(ent), t_error(ent)); + return trace_seq_printf(s, "%llu [%d]\n", + t_sector(ent), t_error(ent)); + } } static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) { - struct blk_io_trace_remap r = { .device = 0, }; + struct blk_io_trace_remap r = { .device_from = 0, }; get_pdu_remap(ent, &r); return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", - t_sector(ent), - t_sec(ent), MAJOR(r.device), MINOR(r.device), - (unsigned long long)r.sector); + t_sector(ent), t_sec(ent), + MAJOR(r.device_from), MINOR(r.device_from), + (unsigned long long)r.sector_from); } static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) @@ -1117,7 +1216,6 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { blk_tracer_enabled = true; - trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } static int blk_tracer_init(struct trace_array *tr) @@ -1130,7 +1228,6 @@ static int blk_tracer_init(struct trace_array *tr) static void blk_tracer_stop(struct trace_array *tr) { blk_tracer_enabled = false; - trace_flags |= TRACE_ITER_CONTEXT_INFO; } static void blk_tracer_reset(struct trace_array *tr) @@ -1182,7 +1279,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) - ret = trace_seq_printf(s, "Bad pc action %x\n", what); + ret = trace_seq_printf(s, "Unknown action %x\n", what); else { ret = log_action(iter, what2act[what].act[long_act]); if (ret) @@ -1195,9 +1292,6 @@ out: static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, int flags) { - if (!trace_print_context(iter)) - return TRACE_TYPE_PARTIAL_LINE; - return print_one_line(iter, false); } @@ -1232,6 +1326,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) return print_one_line(iter, true); } +static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +{ + /* don't output context-info for blk_classic output */ + if (bit == TRACE_BLK_OPT_CLASSIC) { + if (set) + trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + else + trace_flags |= TRACE_ITER_CONTEXT_INFO; + } + return 0; +} + static struct tracer blk_tracer __read_mostly = { .name = "blk", .init = blk_tracer_init, @@ -1241,6 +1347,7 @@ static struct tracer blk_tracer __read_mostly = { .print_header = blk_tracer_print_header, .print_line = blk_tracer_print_line, .flags = &blk_tracer_flags, + .set_flag = blk_tracer_set_flag, }; static struct trace_event trace_blk_event = { @@ -1285,7 +1392,8 @@ static int blk_trace_remove_queue(struct request_queue *q) /* * Setup everything required to start tracing */ -static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) +static int blk_trace_setup_queue(struct request_queue *q, + struct block_device *bdev) { struct blk_trace *old_bt, *bt = NULL; int ret = -ENOMEM; @@ -1298,9 +1406,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) if (!bt->msg_data) goto free_bt; - bt->dev = dev; + bt->dev = bdev->bd_dev; bt->act_mask = (u16)-1; - bt->end_lba = -1ULL; + + blk_trace_setup_lba(bt, bdev); old_bt = xchg(&q->blk_trace, bt); if (old_bt != NULL) { @@ -1517,7 +1626,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (attr == &dev_attr_enable) { if (value) - ret = blk_trace_setup_queue(q, bdev->bd_dev); + ret = blk_trace_setup_queue(q, bdev); else ret = blk_trace_remove_queue(q); goto out_unlock_bdev; @@ -1525,7 +1634,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, ret = 0; if (q->blk_trace == NULL) - ret = blk_trace_setup_queue(q, bdev->bd_dev); + ret = blk_trace_setup_queue(q, bdev); if (ret == 0) { if (attr == &dev_attr_act_mask) @@ -1548,3 +1657,77 @@ out: return ret ? ret : count; } +int blk_trace_init_sysfs(struct device *dev) +{ + return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); +} + +#endif /* CONFIG_BLK_DEV_IO_TRACE */ + +#ifdef CONFIG_EVENT_TRACING + +void blk_dump_cmd(char *buf, struct request *rq) +{ + int i, end; + int len = rq->cmd_len; + unsigned char *cmd = rq->cmd; + + if (!blk_pc_request(rq)) { + buf[0] = '\0'; + return; + } + + for (end = len - 1; end >= 0; end--) + if (cmd[end]) + break; + end++; + + for (i = 0; i < len; i++) { + buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]); + if (i == end && end != len - 1) { + sprintf(buf, " .."); + break; + } + } +} + +void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) +{ + int i = 0; + + if (rw & WRITE) + rwbs[i++] = 'W'; + else if (rw & 1 << BIO_RW_DISCARD) + rwbs[i++] = 'D'; + else if (bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (rw & 1 << BIO_RW_AHEAD) + rwbs[i++] = 'A'; + if (rw & 1 << BIO_RW_BARRIER) + rwbs[i++] = 'B'; + if (rw & 1 << BIO_RW_SYNCIO) + rwbs[i++] = 'S'; + if (rw & 1 << BIO_RW_META) + rwbs[i++] = 'M'; + + rwbs[i] = '\0'; +} + +void blk_fill_rwbs_rq(char *rwbs, struct request *rq) +{ + int rw = rq->cmd_flags & 0x03; + int bytes; + + if (blk_discard_rq(rq)) + rw |= (1 << BIO_RW_DISCARD); + + bytes = blk_rq_bytes(rq); + + blk_fill_rwbs(rwbs, rw, bytes); +} + +#endif /* CONFIG_EVENT_TRACING */ + diff --git a/kernel/trace/events.c b/kernel/trace/events.c deleted file mode 100644 index 246f2aa..0000000 --- a/kernel/trace/events.c +++ /dev/null @@ -1,14 +0,0 @@ -/* - * This is the place to register all trace points as events. - */ - -#include <linux/stringify.h> - -#include <trace/trace_events.h> - -#include "trace_output.h" - -#include "trace_events_stage_1.h" -#include "trace_events_stage_2.h" -#include "trace_events_stage_3.h" - diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f1ed080..bb60732 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,11 +29,13 @@ #include <linux/list.h> #include <linux/hash.h> -#include <trace/sched.h> +#include <trace/events/sched.h> #include <asm/ftrace.h> +#include <asm/setup.h> -#include "trace.h" +#include "trace_output.h" +#include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ do { \ @@ -68,7 +70,7 @@ static DEFINE_MUTEX(ftrace_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { - .func = ftrace_stub, + .func = ftrace_stub, }; static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; @@ -240,6 +242,580 @@ static void ftrace_update_pid_func(void) #endif } +#ifdef CONFIG_FUNCTION_PROFILER +struct ftrace_profile { + struct hlist_node node; + unsigned long ip; + unsigned long counter; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned long long time; +#endif +}; + +struct ftrace_profile_page { + struct ftrace_profile_page *next; + unsigned long index; + struct ftrace_profile records[]; +}; + +struct ftrace_profile_stat { + atomic_t disabled; + struct hlist_head *hash; + struct ftrace_profile_page *pages; + struct ftrace_profile_page *start; + struct tracer_stat stat; +}; + +#define PROFILE_RECORDS_SIZE \ + (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) + +#define PROFILES_PER_PAGE \ + (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) + +static int ftrace_profile_bits __read_mostly; +static int ftrace_profile_enabled __read_mostly; + +/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ +static DEFINE_MUTEX(ftrace_profile_lock); + +static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); + +#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ + +static void * +function_stat_next(void *v, int idx) +{ + struct ftrace_profile *rec = v; + struct ftrace_profile_page *pg; + + pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); + + again: + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { + pg = pg->next; + if (!pg) + return NULL; + rec = &pg->records[0]; + if (!rec->counter) + goto again; + } + + return rec; +} + +static void *function_stat_start(struct tracer_stat *trace) +{ + struct ftrace_profile_stat *stat = + container_of(trace, struct ftrace_profile_stat, stat); + + if (!stat || !stat->start) + return NULL; + + return function_stat_next(&stat->start->records[0], 0); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* function graph compares on total time */ +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; + + if (a->time < b->time) + return -1; + if (a->time > b->time) + return 1; + else + return 0; +} +#else +/* not function graph compares against hits */ +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; + + if (a->counter < b->counter) + return -1; + if (a->counter > b->counter) + return 1; + else + return 0; +} +#endif + +static int function_stat_headers(struct seq_file *m) +{ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " Function " + "Hit Time Avg\n" + " -------- " + "--- ---- ---\n"); +#else + seq_printf(m, " Function Hit\n" + " -------- ---\n"); +#endif + return 0; +} + +static int function_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_profile *rec = v; + char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + static DEFINE_MUTEX(mutex); + static struct trace_seq s; + unsigned long long avg; +#endif + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, " %-30.30s %10lu", str, rec->counter); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " "); + avg = rec->time; + do_div(avg, rec->counter); + + mutex_lock(&mutex); + trace_seq_init(&s); + trace_print_graph_duration(rec->time, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(avg, &s); + trace_print_seq(m, &s); + mutex_unlock(&mutex); +#endif + seq_putc(m, '\n'); + + return 0; +} + +static void ftrace_profile_reset(struct ftrace_profile_stat *stat) +{ + struct ftrace_profile_page *pg; + + pg = stat->pages = stat->start; + + while (pg) { + memset(pg->records, 0, PROFILE_RECORDS_SIZE); + pg->index = 0; + pg = pg->next; + } + + memset(stat->hash, 0, + FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); +} + +int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) +{ + struct ftrace_profile_page *pg; + int functions; + int pages; + int i; + + /* If we already allocated, do nothing */ + if (stat->pages) + return 0; + + stat->pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!stat->pages) + return -ENOMEM; + +#ifdef CONFIG_DYNAMIC_FTRACE + functions = ftrace_update_tot_cnt; +#else + /* + * We do not know the number of functions that exist because + * dynamic tracing is what counts them. With past experience + * we have around 20K functions. That should be more than enough. + * It is highly unlikely we will execute every function in + * the kernel. + */ + functions = 20000; +#endif + + pg = stat->start = stat->pages; + + pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); + + for (i = 0; i < pages; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + if (!pg->next) + goto out_free; + pg = pg->next; + } + + return 0; + + out_free: + pg = stat->start; + while (pg) { + unsigned long tmp = (unsigned long)pg; + + pg = pg->next; + free_page(tmp); + } + + free_page((unsigned long)stat->pages); + stat->pages = NULL; + stat->start = NULL; + + return -ENOMEM; +} + +static int ftrace_profile_init_cpu(int cpu) +{ + struct ftrace_profile_stat *stat; + int size; + + stat = &per_cpu(ftrace_profile_stats, cpu); + + if (stat->hash) { + /* If the profile is already created, simply reset it */ + ftrace_profile_reset(stat); + return 0; + } + + /* + * We are profiling all functions, but usually only a few thousand + * functions are hit. We'll make a hash of 1024 items. + */ + size = FTRACE_PROFILE_HASH_SIZE; + + stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + + if (!stat->hash) + return -ENOMEM; + + if (!ftrace_profile_bits) { + size--; + + for (; size; size >>= 1) + ftrace_profile_bits++; + } + + /* Preallocate the function profiling pages */ + if (ftrace_profile_pages_init(stat) < 0) { + kfree(stat->hash); + stat->hash = NULL; + return -ENOMEM; + } + + return 0; +} + +static int ftrace_profile_init(void) +{ + int cpu; + int ret = 0; + + for_each_online_cpu(cpu) { + ret = ftrace_profile_init_cpu(cpu); + if (ret) + break; + } + + return ret; +} + +/* interrupts must be disabled */ +static struct ftrace_profile * +ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) +{ + struct ftrace_profile *rec; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long key; + + key = hash_long(ip, ftrace_profile_bits); + hhd = &stat->hash[key]; + + if (hlist_empty(hhd)) + return NULL; + + hlist_for_each_entry_rcu(rec, n, hhd, node) { + if (rec->ip == ip) + return rec; + } + + return NULL; +} + +static void ftrace_add_profile(struct ftrace_profile_stat *stat, + struct ftrace_profile *rec) +{ + unsigned long key; + + key = hash_long(rec->ip, ftrace_profile_bits); + hlist_add_head_rcu(&rec->node, &stat->hash[key]); +} + +/* + * The memory is already allocated, this simply finds a new record to use. + */ +static struct ftrace_profile * +ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) +{ + struct ftrace_profile *rec = NULL; + + /* prevent recursion (from NMIs) */ + if (atomic_inc_return(&stat->disabled) != 1) + goto out; + + /* + * Try to find the function again since an NMI + * could have added it + */ + rec = ftrace_find_profiled_func(stat, ip); + if (rec) + goto out; + + if (stat->pages->index == PROFILES_PER_PAGE) { + if (!stat->pages->next) + goto out; + stat->pages = stat->pages->next; + } + + rec = &stat->pages->records[stat->pages->index++]; + rec->ip = ip; + ftrace_add_profile(stat, rec); + + out: + atomic_dec(&stat->disabled); + + return rec; +} + +static void +function_profile_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_profile_stat *stat; + struct ftrace_profile *rec; + unsigned long flags; + + if (!ftrace_profile_enabled) + return; + + local_irq_save(flags); + + stat = &__get_cpu_var(ftrace_profile_stats); + if (!stat->hash || !ftrace_profile_enabled) + goto out; + + rec = ftrace_find_profiled_func(stat, ip); + if (!rec) { + rec = ftrace_profile_alloc(stat, ip); + if (!rec) + goto out; + } + + rec->counter++; + out: + local_irq_restore(flags); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int profile_graph_entry(struct ftrace_graph_ent *trace) +{ + function_profile_call(trace->func, 0); + return 1; +} + +static void profile_graph_return(struct ftrace_graph_ret *trace) +{ + struct ftrace_profile_stat *stat; + unsigned long long calltime; + struct ftrace_profile *rec; + unsigned long flags; + + local_irq_save(flags); + stat = &__get_cpu_var(ftrace_profile_stats); + if (!stat->hash || !ftrace_profile_enabled) + goto out; + + calltime = trace->rettime - trace->calltime; + + if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { + int index; + + index = trace->depth; + + /* Append this call time to the parent time to subtract */ + if (index) + current->ret_stack[index - 1].subtime += calltime; + + if (current->ret_stack[index].subtime < calltime) + calltime -= current->ret_stack[index].subtime; + else + calltime = 0; + } + + rec = ftrace_find_profiled_func(stat, trace->func); + if (rec) + rec->time += calltime; + + out: + local_irq_restore(flags); +} + +static int register_ftrace_profiler(void) +{ + return register_ftrace_graph(&profile_graph_return, + &profile_graph_entry); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_graph(); +} +#else +static struct ftrace_ops ftrace_profile_ops __read_mostly = +{ + .func = function_profile_call, +}; + +static int register_ftrace_profiler(void) +{ + return register_ftrace_function(&ftrace_profile_ops); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_function(&ftrace_profile_ops); +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +static ssize_t +ftrace_profile_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; /* big enough to hold a number */ + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + + mutex_lock(&ftrace_profile_lock); + if (ftrace_profile_enabled ^ val) { + if (val) { + ret = ftrace_profile_init(); + if (ret < 0) { + cnt = ret; + goto out; + } + + ret = register_ftrace_profiler(); + if (ret < 0) { + cnt = ret; + goto out; + } + ftrace_profile_enabled = 1; + } else { + ftrace_profile_enabled = 0; + /* + * unregister_ftrace_profiler calls stop_machine + * so this acts like an synchronize_sched. + */ + unregister_ftrace_profiler(); + } + } + out: + mutex_unlock(&ftrace_profile_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; /* big enough to hold a number */ + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static const struct file_operations ftrace_profile_fops = { + .open = tracing_open_generic, + .read = ftrace_profile_read, + .write = ftrace_profile_write, +}; + +/* used to initialize the real stat files */ +static struct tracer_stat function_stats __initdata = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ + struct ftrace_profile_stat *stat; + struct dentry *entry; + char *name; + int ret; + int cpu; + + for_each_possible_cpu(cpu) { + stat = &per_cpu(ftrace_profile_stats, cpu); + + /* allocate enough for function name + cpu number */ + name = kmalloc(32, GFP_KERNEL); + if (!name) { + /* + * The files created are permanent, if something happens + * we still do not free memory. + */ + kfree(stat); + WARN(1, + "Could not allocate stat file for cpu %d\n", + cpu); + return; + } + stat->stat = function_stats; + snprintf(name, 32, "function%d", cpu); + stat->stat.name = name; + ret = register_stat_tracer(&stat->stat); + if (ret) { + WARN(1, + "Could not register function stat for cpu %d\n", + cpu); + kfree(name); + return; + } + } + + entry = debugfs_create_file("function_profile_enabled", 0644, + d_tracer, NULL, &ftrace_profile_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'function_profile_enabled' entry\n"); +} + +#else /* CONFIG_FUNCTION_PROFILER */ +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ +} +#endif /* CONFIG_FUNCTION_PROFILER */ + /* set when tracing only a pid */ struct pid *ftrace_pid_trace; static struct pid * const ftrace_swapper_pid = &init_struct_pid; @@ -261,7 +837,6 @@ struct ftrace_func_probe { struct rcu_head rcu; }; - enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -346,30 +921,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec) rec->flags |= FTRACE_FL_FREE; } -void ftrace_release(void *start, unsigned long size) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - unsigned long s = (unsigned long)start; - unsigned long e = s + size; - - if (ftrace_disabled || !start) - return; - - mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) { - /* - * rec->ip is changed in ftrace_free_rec() - * It should not between s and e if record was freed. - */ - FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); - ftrace_free_rec(rec); - } - } while_for_each_ftrace_rec(); - mutex_unlock(&ftrace_lock); -} - static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -1408,7 +1959,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_probe_ops __read_mostly = { - .func = function_trace_probe_call, + .func = function_trace_probe_call, }; static int ftrace_probe_registered; @@ -1823,6 +2374,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset) ftrace_set_regex(buf, len, reset, 0); } +/* + * command line interface to allow users to set filters on boot up. + */ +#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE +static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; + +static int __init set_ftrace_notrace(char *str) +{ + strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_notrace=", set_ftrace_notrace); + +static int __init set_ftrace_filter(char *str) +{ + strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_filter=", set_ftrace_filter); + +static void __init set_ftrace_early_filter(char *buf, int enable) +{ + char *func; + + while (buf) { + func = strsep(&buf, ","); + ftrace_set_regex(func, strlen(func), 0, enable); + } +} + +static void __init set_ftrace_early_filters(void) +{ + if (ftrace_filter_buf[0]) + set_ftrace_early_filter(ftrace_filter_buf, 1); + if (ftrace_notrace_buf[0]) + set_ftrace_early_filter(ftrace_notrace_buf, 0); +} + static int ftrace_regex_release(struct inode *inode, struct file *file, int enable) { @@ -2128,38 +2718,23 @@ static const struct file_operations ftrace_graph_fops = { static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { - struct dentry *entry; - entry = debugfs_create_file("available_filter_functions", 0444, - d_tracer, NULL, &ftrace_avail_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'available_filter_functions' entry\n"); + trace_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); - entry = debugfs_create_file("failures", 0444, - d_tracer, NULL, &ftrace_failures_fops); - if (!entry) - pr_warning("Could not create debugfs 'failures' entry\n"); + trace_create_file("failures", 0444, + d_tracer, NULL, &ftrace_failures_fops); - entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, - NULL, &ftrace_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_filter' entry\n"); + trace_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); - entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, + trace_create_file("set_ftrace_notrace", 0644, d_tracer, NULL, &ftrace_notrace_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_notrace' entry\n"); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - entry = debugfs_create_file("set_graph_function", 0444, d_tracer, + trace_create_file("set_graph_function", 0444, d_tracer, NULL, &ftrace_graph_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ return 0; @@ -2197,14 +2772,72 @@ static int ftrace_convert_nops(struct module *mod, return 0; } -void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) +#ifdef CONFIG_MODULES +void ftrace_release(void *start, void *end) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long s = (unsigned long)start; + unsigned long e = (unsigned long)end; + + if (ftrace_disabled || !start || start == end) + return; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + if ((rec->ip >= s) && (rec->ip < e)) { + /* + * rec->ip is changed in ftrace_free_rec() + * It should not between s and e if record was freed. + */ + FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); + ftrace_free_rec(rec); + } + } while_for_each_ftrace_rec(); + mutex_unlock(&ftrace_lock); +} + +static void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { if (ftrace_disabled || start == end) return; ftrace_convert_nops(mod, start, end); } +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + ftrace_init_module(mod, mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + case MODULE_STATE_GOING: + ftrace_release(mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + } + + return 0; +} +#else +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + +struct notifier_block ftrace_module_nb = { + .notifier_call = ftrace_module_notify, + .priority = 0, +}; + extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; @@ -2236,6 +2869,12 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); + ret = register_module_notifier(&ftrace_module_nb); + if (ret) + pr_warning("Failed to register trace ftrace module notifier\n"); + + set_ftrace_early_filters(); + return; failed: ftrace_disabled = 1; @@ -2417,7 +3056,6 @@ static const struct file_operations ftrace_pid_fops = { static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -2425,11 +3063,11 @@ static __init int ftrace_init_debugfs(void) ftrace_init_dyn_debugfs(d_tracer); - entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, - NULL, &ftrace_pid_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_pid' entry\n"); + trace_create_file("set_ftrace_pid", 0644, d_tracer, + NULL, &ftrace_pid_fops); + + ftrace_profile_debugfs(d_tracer); + return 0; } fs_initcall(ftrace_init_debugfs); @@ -2538,7 +3176,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static atomic_t ftrace_graph_active; +static int ftrace_graph_active; static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) @@ -2580,12 +3218,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } if (t->ret_stack == NULL) { - t->curr_ret_stack = -1; - /* Make sure IRQs see the -1 first: */ - barrier(); - t->ret_stack = ret_stack_list[start++]; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); + t->curr_ret_stack = -1; + /* Make sure the tasks see the -1 first: */ + smp_wmb(); + t->ret_stack = ret_stack_list[start++]; } } while_each_thread(g, t); @@ -2643,8 +3281,10 @@ static int start_graph_tracing(void) return -ENOMEM; /* The cpu_boot init_task->ret_stack will never be freed */ - for_each_online_cpu(cpu) - ftrace_graph_init_task(idle_task(cpu)); + for_each_online_cpu(cpu) { + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_task(idle_task(cpu)); + } do { ret = alloc_retstack_tasklist(ret_stack_list); @@ -2690,7 +3330,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_lock); /* we currently allow only one tracer registered at a time */ - if (atomic_read(&ftrace_graph_active)) { + if (ftrace_graph_active) { ret = -EBUSY; goto out; } @@ -2698,10 +3338,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); - atomic_inc(&ftrace_graph_active); + ftrace_graph_active++; ret = start_graph_tracing(); if (ret) { - atomic_dec(&ftrace_graph_active); + ftrace_graph_active--; goto out; } @@ -2719,10 +3359,10 @@ void unregister_ftrace_graph(void) { mutex_lock(&ftrace_lock); - if (!unlikely(atomic_read(&ftrace_graph_active))) + if (unlikely(!ftrace_graph_active)) goto out; - atomic_dec(&ftrace_graph_active); + ftrace_graph_active--; unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; @@ -2736,18 +3376,25 @@ void unregister_ftrace_graph(void) /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { - if (atomic_read(&ftrace_graph_active)) { - t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + /* Make sure we do not use the parent ret_stack */ + t->ret_stack = NULL; + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH * sizeof(struct ftrace_ret_stack), GFP_KERNEL); - if (!t->ret_stack) + if (!ret_stack) return; t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; - } else - t->ret_stack = NULL; + /* make curr_ret_stack visable before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; + } } void ftrace_graph_exit_task(struct task_struct *t) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 5011f4d..86cdf67 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -12,7 +12,7 @@ #include <linux/dcache.h> #include <linux/fs.h> -#include <trace/kmemtrace.h> +#include <linux/kmemtrace.h> #include "trace_output.h" #include "trace.h" @@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, gfp_t gfp_flags, int node) { + struct ftrace_event_call *call = &event_kmem_alloc; struct trace_array *tr = kmemtrace_array; struct kmemtrace_alloc_entry *entry; struct ring_buffer_event *event; @@ -62,7 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -71,6 +73,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, unsigned long call_site, const void *ptr) { + struct ftrace_event_call *call = &event_kmem_free; struct trace_array *tr = kmemtrace_array; struct kmemtrace_free_entry *entry; struct ring_buffer_event *event; @@ -86,7 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 960cbf4..2e642b2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -22,6 +22,28 @@ #include "trace.h" /* + * The ring buffer header is special. We must manually up keep it. + */ +int ring_buffer_print_entry_header(struct trace_seq *s) +{ + int ret; + + ret = trace_seq_printf(s, "# compressed entry header\n"); + ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); + ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); + ret = trace_seq_printf(s, "\tarray : 32 bits\n"); + ret = trace_seq_printf(s, "\n"); + ret = trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + ret = trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + + return ret; +} + +/* * The ring buffer is made up of a list of pages. A separate list of pages is * allocated for each CPU. A writer may only write to a buffer that is * associated with the CPU it is currently executing on. A reader may read @@ -182,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA 28 +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + +/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ +#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX enum { RB_LEN_TIME_EXTEND = 8, @@ -191,48 +216,28 @@ enum { static inline int rb_null_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; + return event->type_len == RINGBUF_TYPE_PADDING + && event->time_delta == 0; } static inline int rb_discarded_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta; + return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; + event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } -/** - * ring_buffer_event_discard - discard an event in the ring buffer - * @buffer: the ring buffer - * @event: the event to discard - * - * Sometimes a event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * Note, it is up to the user to be careful with this, and protect - * against races. If the user discards an event that has been consumed - * it is possible that it could corrupt the ring buffer. - */ -void ring_buffer_event_discard(struct ring_buffer_event *event) -{ - event->type = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - static unsigned rb_event_data_length(struct ring_buffer_event *event) { unsigned length; - if (event->len) - length = event->len * RB_ALIGNMENT; + if (event->type_len) + length = event->type_len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; @@ -242,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event) static unsigned rb_event_length(struct ring_buffer_event *event) { - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) /* undefined */ return -1; - return rb_event_data_length(event); + return event->array[0] + RB_EVNT_HDR_SIZE; case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -271,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event) unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length = rb_event_length(event); - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) @@ -284,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static void * rb_event_data(struct ring_buffer_event *event) { - BUG_ON(event->type != RINGBUF_TYPE_DATA); + BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ - if (event->len) + if (event->type_len) return (void *)&event->array[0]; /* Otherwise length is in array[0] and array[1] has the data */ return (void *)&event->array[1]; @@ -316,9 +321,10 @@ struct buffer_data_page { }; struct buffer_page { + struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ unsigned read; /* index for next read */ - struct list_head list; /* list of free pages */ + local_t entries; /* entries on this page */ struct buffer_data_page *page; /* Actual data page */ }; @@ -361,6 +367,34 @@ static inline int test_time_stamp(u64 delta) #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) +/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ +#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) + +/* Max number of timestamps that can fit on a page */ +#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) + +int ring_buffer_print_page_header(struct trace_seq *s) +{ + struct buffer_data_page field; + int ret; + + ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\n", + (unsigned int)sizeof(field.time_stamp)); + + ret = trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit)); + + ret = trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE); + + return ret; +} + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -375,8 +409,11 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; + unsigned long nmi_dropped; + unsigned long commit_overrun; unsigned long overrun; - unsigned long entries; + unsigned long read; + local_t entries; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -389,6 +426,8 @@ struct ring_buffer { atomic_t record_disabled; cpumask_var_t cpumask; + struct lock_class_key *reader_lock_key; + struct mutex mutex; struct ring_buffer_per_cpu **buffers; @@ -420,13 +459,18 @@ struct ring_buffer_iter { /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 +static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +{ + /* shift to debug/test normalization and TIME_EXTENTS */ + return buffer->clock() << DEBUG_SHIFT; +} + u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) { u64 time; preempt_disable_notrace(); - /* shift to debug/test normalization and TIME_EXTENTS */ - time = buffer->clock() << DEBUG_SHIFT; + time = rb_time_stamp(buffer, cpu); preempt_enable_no_resched_notrace(); return time; @@ -523,6 +567,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; spin_lock_init(&cpu_buffer->reader_lock); + lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); @@ -593,7 +638,8 @@ static int rb_cpu_notify(struct notifier_block *self, * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ -struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) +struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, + struct lock_class_key *key) { struct ring_buffer *buffer; int bsize; @@ -616,6 +662,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; buffer->clock = trace_clock_local; + buffer->reader_lock_key = key; /* need at least two pages */ if (buffer->pages == 1) @@ -673,7 +720,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) kfree(buffer); return NULL; } -EXPORT_SYMBOL_GPL(ring_buffer_alloc); +EXPORT_SYMBOL_GPL(__ring_buffer_alloc); /** * ring_buffer_free - free a ring buffer. @@ -947,31 +994,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) return rb_page_commit(cpu_buffer->head_page); } -/* - * When the tail hits the head and the buffer is in overwrite mode, - * the head jumps to the next page and all content on the previous - * page is discarded. But before doing so, we update the overrun - * variable of the buffer. - */ -static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct ring_buffer_event *event; - unsigned long head; - - for (head = 0; head < rb_head_size(cpu_buffer); - head += rb_event_length(event)) { - - event = __rb_page_index(cpu_buffer->head_page, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) - continue; - cpu_buffer->overrun++; - cpu_buffer->entries--; - } -} - static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page **bpage) { @@ -991,7 +1013,7 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); } -static int +static inline int rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -1110,28 +1132,21 @@ static void rb_update_event(struct ring_buffer_event *event, unsigned type, unsigned length) { - event->type = type; + event->type_len = type; switch (type) { case RINGBUF_TYPE_PADDING: - break; - case RINGBUF_TYPE_TIME_EXTEND: - event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT); - break; - case RINGBUF_TYPE_TIME_STAMP: - event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT); break; - case RINGBUF_TYPE_DATA: + case 0: length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) { - event->len = 0; + if (length > RB_MAX_SMALL_DATA) event->array[0] = length; - } else - event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); + else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); break; default: BUG(); @@ -1155,131 +1170,156 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } + static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 *ts) +rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length, unsigned long tail, + struct buffer_page *commit_page, + struct buffer_page *tail_page, u64 *ts) { - struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; - unsigned long tail, write; + struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; - unsigned long flags; bool lock_taken = false; + unsigned long flags; - commit_page = cpu_buffer->commit_page; - /* we just need to protect against interrupts */ - barrier(); - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); - tail = write - length; + next_page = tail_page; - /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) { - struct buffer_page *next_page = tail_page; + local_irq_save(flags); + /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. + */ + if (unlikely(in_nmi())) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) { + cpu_buffer->nmi_dropped++; + goto out_reset; + } + } else + __raw_spin_lock(&cpu_buffer->lock); - local_irq_save(flags); - /* - * Since the write to the buffer is still not - * fully lockless, we must be careful with NMIs. - * The locks in the writers are taken when a write - * crosses to a new page. The locks protect against - * races with the readers (this will soon be fixed - * with a lockless solution). - * - * Because we can not protect against NMIs, and we - * want to keep traces reentrant, we need to manage - * what happens when we are in an NMI. - * - * NMIs can happen after we take the lock. - * If we are in an NMI, only take the lock - * if it is not already taken. Otherwise - * simply fail. - */ - if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) - goto out_reset; - } else - __raw_spin_lock(&cpu_buffer->lock); + lock_taken = true; - lock_taken = true; + rb_inc_page(cpu_buffer, &next_page); - rb_inc_page(cpu_buffer, &next_page); + head_page = cpu_buffer->head_page; + reader_page = cpu_buffer->reader_page; - head_page = cpu_buffer->head_page; - reader_page = cpu_buffer->reader_page; + /* we grabbed the lock before incrementing */ + if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) + goto out_reset; - /* we grabbed the lock before incrementing */ - if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_reset; + /* + * If for some reason, we had an interrupt storm that made + * it all the way around the buffer, bail, and warn + * about it. + */ + if (unlikely(next_page == commit_page)) { + cpu_buffer->commit_overrun++; + goto out_reset; + } - /* - * If for some reason, we had an interrupt storm that made - * it all the way around the buffer, bail, and warn - * about it. - */ - if (unlikely(next_page == commit_page)) { - WARN_ON_ONCE(1); + if (next_page == head_page) { + if (!(buffer->flags & RB_FL_OVERWRITE)) goto out_reset; - } - if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_reset; - - /* tail_page has not moved yet? */ - if (tail_page == cpu_buffer->tail_page) { - /* count overflows */ - rb_update_overflow(cpu_buffer); + /* tail_page has not moved yet? */ + if (tail_page == cpu_buffer->tail_page) { + /* count overflows */ + cpu_buffer->overrun += + local_read(&head_page->entries); - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - cpu_buffer->head_page->read = 0; - } + rb_inc_page(cpu_buffer, &head_page); + cpu_buffer->head_page = head_page; + cpu_buffer->head_page->read = 0; } + } - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer. - */ - if (tail_page == cpu_buffer->tail_page) { - local_set(&next_page->write, 0); - local_set(&next_page->page->commit, 0); - cpu_buffer->tail_page = next_page; + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + local_set(&next_page->write, 0); + local_set(&next_page->entries, 0); + local_set(&next_page->page->commit, 0); + cpu_buffer->tail_page = next_page; + + /* reread the time stamp */ + *ts = rb_time_stamp(buffer, cpu_buffer->cpu); + cpu_buffer->tail_page->page->time_stamp = *ts; + } - /* reread the time stamp */ - *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); - cpu_buffer->tail_page->page->time_stamp = *ts; - } + /* + * The actual tail page has moved forward. + */ + if (tail < BUF_PAGE_SIZE) { + /* Mark the rest of the page with padding */ + event = __rb_page_index(tail_page, tail); + rb_event_set_padding(event); + } - /* - * The actual tail page has moved forward. - */ - if (tail < BUF_PAGE_SIZE) { - /* Mark the rest of the page with padding */ - event = __rb_page_index(tail_page, tail); - rb_event_set_padding(event); - } + /* Set the write back to the previous setting */ + local_sub(length, &tail_page->write); - if (tail <= BUF_PAGE_SIZE) - /* Set the write back to the previous setting */ - local_set(&tail_page->write, tail); + /* + * If this was a commit entry that failed, + * increment that too + */ + if (tail_page == cpu_buffer->commit_page && + tail == rb_commit_index(cpu_buffer)) { + rb_set_commit_to_write(cpu_buffer); + } - /* - * If this was a commit entry that failed, - * increment that too - */ - if (tail_page == cpu_buffer->commit_page && - tail == rb_commit_index(cpu_buffer)) { - rb_set_commit_to_write(cpu_buffer); - } + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + /* fail and let the caller try again */ + return ERR_PTR(-EAGAIN); + + out_reset: + /* reset write */ + local_sub(length, &tail_page->write); + if (likely(lock_taken)) __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + local_irq_restore(flags); + return NULL; +} - /* fail and let the caller try again */ - return ERR_PTR(-EAGAIN); - } +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + unsigned type, unsigned long length, u64 *ts) +{ + struct buffer_page *tail_page, *commit_page; + struct ring_buffer_event *event; + unsigned long tail, write; + + commit_page = cpu_buffer->commit_page; + /* we just need to protect against interrupts */ + barrier(); + tail_page = cpu_buffer->tail_page; + write = local_add_return(length, &tail_page->write); + tail = write - length; + + /* See if we shot pass the end of this buffer page */ + if (write > BUF_PAGE_SIZE) + return rb_move_tail(cpu_buffer, length, tail, + commit_page, tail_page, ts); /* We reserved something on the buffer */ @@ -1289,6 +1329,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); + /* The passed in type is zero for DATA */ + if (likely(!type)) + local_inc(&tail_page->entries); + /* * If this is a commit and the tail is zero, then update * this page's time stamp. @@ -1297,16 +1341,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->commit_page->page->time_stamp = *ts; return event; +} - out_reset: - /* reset write */ - if (tail <= BUF_PAGE_SIZE) - local_set(&tail_page->write, tail); +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; - if (likely(lock_taken)) - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); - return NULL; + new_index = rb_event_index(event); + old_index = new_index + rb_event_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) + return 1; + } + + /* could not discard */ + return 0; } static int @@ -1351,16 +1417,23 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, event->array[0] = *delta >> TS_SHIFT; } else { cpu_buffer->commit_page->page->time_stamp = *ts; - event->time_delta = 0; - event->array[0] = 0; + /* try to discard, since we do not need this */ + if (!rb_try_to_discard(cpu_buffer, event)) { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } } cpu_buffer->write_stamp = *ts; /* let the caller know this was the commit */ ret = 1; } else { - /* Darn, this is just wasted space */ - event->time_delta = 0; - event->array[0] = 0; + /* Try to discard the event */ + if (!rb_try_to_discard(cpu_buffer, event)) { + /* Darn, this is just wasted space */ + event->time_delta = 0; + event->array[0] = 0; + } ret = 0; } @@ -1371,13 +1444,14 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length) + unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta; + u64 ts, delta = 0; int commit = 0; int nr_loops = 0; + length = rb_calculate_event_length(length); again: /* * We allow for interrupts to reenter here and do a trace. @@ -1391,7 +1465,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); /* * Only the first commit can update the timestamp. @@ -1401,23 +1475,24 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * also be made. But only the entry that did the actual * commit will be something other than zero. */ - if (cpu_buffer->tail_page == cpu_buffer->commit_page && - rb_page_write(cpu_buffer->tail_page) == - rb_commit_index(cpu_buffer)) { + if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && + rb_page_write(cpu_buffer->tail_page) == + rb_commit_index(cpu_buffer))) { + u64 diff; - delta = ts - cpu_buffer->write_stamp; + diff = ts - cpu_buffer->write_stamp; - /* make sure this delta is calculated here */ + /* make sure this diff is calculated here */ barrier(); /* Did the write stamp get updated already? */ if (unlikely(ts < cpu_buffer->write_stamp)) - delta = 0; + goto get_event; - if (test_time_stamp(delta)) { + delta = diff; + if (unlikely(test_time_stamp(delta))) { commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); - if (commit == -EBUSY) return NULL; @@ -1426,12 +1501,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, RB_WARN_ON(cpu_buffer, commit < 0); } - } else - /* Non commits have zero deltas */ - delta = 0; + } - event = __rb_reserve_next(cpu_buffer, type, length, &ts); - if (PTR_ERR(event) == -EAGAIN) + get_event: + event = __rb_reserve_next(cpu_buffer, 0, length, &ts); + if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; if (!event) { @@ -1448,7 +1522,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * If the timestamp was commited, make the commit our entry * now so that we will update it when needed. */ - if (commit) + if (unlikely(commit)) rb_set_commit_event(cpu_buffer, event); else if (!rb_is_commit(cpu_buffer, event)) delta = 0; @@ -1458,6 +1532,36 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return event; } +#define TRACE_RECURSIVE_DEPTH 16 + +static int trace_recursive_lock(void) +{ + current->trace_recursion++; + + if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + return 0; + + /* Disable all tracing before we do anything else */ + tracing_off_permanent(); + + printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" + "HC[%lu]:SC[%lu]:NMI[%lu]\n", + current->trace_recursion, + hardirq_count() >> HARDIRQ_SHIFT, + softirq_count() >> SOFTIRQ_SHIFT, + in_nmi()); + + WARN_ON_ONCE(1); + return -1; +} + +static void trace_recursive_unlock(void) +{ + WARN_ON_ONCE(!current->trace_recursion); + + current->trace_recursion--; +} + static DEFINE_PER_CPU(int, rb_need_resched); /** @@ -1491,6 +1595,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) /* If we are tracing schedule, we don't want to recurse */ resched = ftrace_preempt_disable(); + if (trace_recursive_lock()) + goto out_nocheck; + cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -1501,11 +1608,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (atomic_read(&cpu_buffer->record_disabled)) goto out; - length = rb_calculate_event_length(length); - if (length > BUF_PAGE_SIZE) + if (length > BUF_MAX_DATA_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); + event = rb_reserve_next_event(cpu_buffer, length); if (!event) goto out; @@ -1520,6 +1626,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) return event; out: + trace_recursive_unlock(); + + out_nocheck: ftrace_preempt_enable(resched); return NULL; } @@ -1528,7 +1637,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { - cpu_buffer->entries++; + local_inc(&cpu_buffer->entries); /* Only process further if we own the commit */ if (!rb_is_commit(cpu_buffer, event)) @@ -1558,6 +1667,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + trace_recursive_unlock(); + /* * Only the last preempt count needs to restore preemption. */ @@ -1570,6 +1681,99 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +static inline void rb_event_discard(struct ring_buffer_event *event) +{ + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + +/** + * ring_buffer_event_discard - discard any event in the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. + */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ + rb_event_discard(event); +} +EXPORT_SYMBOL_GPL(ring_buffer_event_discard); + +/** + * ring_buffer_commit_discard - discard an event that has not been committed + * @buffer: the ring buffer + * @event: non committed event to discard + * + * This is similar to ring_buffer_event_discard but must only be + * performed on an event that has not been committed yet. The difference + * is that this will also try to free the event from the ring buffer + * if another event has not been added behind it. + * + * If another event has been added behind it, it will set the event + * up as discarded, and perform the commit. + * + * If this function is called, do not call ring_buffer_unlock_commit on + * the event. + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* The event is discarded regardless */ + rb_event_discard(event); + + /* + * This must only be called if the event has not been + * committed yet. Thus we can assume that preemption + * is still disabled. + */ + RB_WARN_ON(buffer, preemptible()); + + cpu = smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + + if (!rb_try_to_discard(cpu_buffer, event)) + goto out; + + /* + * The commit is still visible by the reader, so we + * must increment entries. + */ + local_inc(&cpu_buffer->entries); + out: + /* + * If a write came in and pushed the tail page + * we still need to update the commit pointer + * if we were the commit. + */ + if (rb_is_commit(cpu_buffer, event)) + rb_set_commit_to_write(cpu_buffer); + + trace_recursive_unlock(); + + /* + * Only the last preempt count needs to restore preemption. + */ + if (preempt_count() == 1) + ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); + else + preempt_enable_no_resched_notrace(); + +} +EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); + /** * ring_buffer_write - write data to the buffer without reserving * @buffer: The ring buffer to write to. @@ -1589,7 +1793,6 @@ int ring_buffer_write(struct ring_buffer *buffer, { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - unsigned long event_length; void *body; int ret = -EBUSY; int cpu, resched; @@ -1612,9 +1815,10 @@ int ring_buffer_write(struct ring_buffer *buffer, if (atomic_read(&cpu_buffer->record_disabled)) goto out; - event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, - RINGBUF_TYPE_DATA, event_length); + if (length > BUF_MAX_DATA_SIZE) + goto out; + + event = rb_reserve_next_event(cpu_buffer, length); if (!event) goto out; @@ -1728,7 +1932,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->entries; + ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) + - cpu_buffer->read; return ret; } @@ -1755,6 +1960,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); /** + * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->nmi_dropped; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); + +/** + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->commit_overrun; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); + +/** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer * @@ -1770,7 +2016,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - entries += cpu_buffer->entries; + entries += (local_read(&cpu_buffer->entries) - + cpu_buffer->overrun) - cpu_buffer->read; } return entries; @@ -1862,7 +2109,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -1893,7 +2140,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -1966,6 +2213,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.prev = reader->list.prev; local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); /* Make the reader page now replace the head */ @@ -2008,8 +2256,9 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) - cpu_buffer->entries--; + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX + || rb_discarded_event(event)) + cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); @@ -2031,8 +2280,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * Check if we are at the end of the buffer. */ if (iter->head >= rb_page_size(iter->head_page)) { - if (RB_WARN_ON(buffer, - iter->head_page == cpu_buffer->commit_page)) + /* discarded commits can make the page empty */ + if (iter->head_page == cpu_buffer->commit_page) return; rb_inc_iter(iter); return; @@ -2075,12 +2324,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) /* * We repeat when a timestamp is encountered. It is possible * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written. The max times - * that this can happen is the number of nested interrupts we - * can have. Nesting 10 deep of interrupts is clearly - * an anomaly. + * as one timestamp is about to be written, or from discarded + * commits. The most that we can have is the number on a single page. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) return NULL; reader = rb_get_reader_page(cpu_buffer); @@ -2089,7 +2336,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_reader_event(cpu_buffer); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) RB_WARN_ON(cpu_buffer, 1); @@ -2146,14 +2393,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) again: /* - * We repeat when a timestamp is encountered. It is possible - * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written. The max times - * that this can happen is the number of nested interrupts we - * can have. Nesting 10 deep of interrupts is clearly - * an anomaly. + * We repeat when a timestamp is encountered. + * We can get multiple timestamps by nested interrupts or also + * if filtering is on (discarding commits). Since discarding + * commits can be frequent we can get a lot of timestamps. + * But we limit them by not adding timestamps if they begin + * at the start of a page. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) @@ -2161,7 +2408,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_head_event(iter); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) { rb_inc_iter(iter); @@ -2220,7 +2467,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2248,7 +2495,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2293,7 +2540,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2386,7 +2633,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2411,6 +2658,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); + local_set(&cpu_buffer->head_page->entries, 0); local_set(&cpu_buffer->head_page->page->commit, 0); cpu_buffer->head_page->read = 0; @@ -2420,11 +2668,15 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; + cpu_buffer->nmi_dropped = 0; + cpu_buffer->commit_overrun = 0; cpu_buffer->overrun = 0; - cpu_buffer->entries = 0; + cpu_buffer->read = 0; + local_set(&cpu_buffer->entries, 0); cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; @@ -2443,6 +2695,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; + atomic_inc(&cpu_buffer->record_disabled); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); __raw_spin_lock(&cpu_buffer->lock); @@ -2452,6 +2706,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + atomic_dec(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -2578,28 +2834,6 @@ out: } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); -static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_data_page *bpage, - unsigned int offset) -{ - struct ring_buffer_event *event; - unsigned long head; - - __raw_spin_lock(&cpu_buffer->lock); - for (head = offset; head < local_read(&bpage->commit); - head += rb_event_length(event)) { - - event = __rb_data_page_index(bpage, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) - continue; - cpu_buffer->entries--; - } - __raw_spin_unlock(&cpu_buffer->lock); -} - /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. @@ -2630,6 +2864,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) return bpage; } +EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page @@ -2642,6 +2877,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) { free_page((unsigned long)data); } +EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); /** * ring_buffer_read_page - extract a page from the ring buffer @@ -2768,16 +3004,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* we copied everything to the beginning */ read = 0; } else { + /* update the entry counter */ + cpu_buffer->read += local_read(&reader->entries); + /* swap the pages */ rb_init_page(bpage); bpage = reader->page; reader->page = *data_page; local_set(&reader->write, 0); + local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; - - /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage, read); } ret = read; @@ -2787,6 +3024,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, out: return ret; } +EXPORT_SYMBOL_GPL(ring_buffer_read_page); static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, @@ -2845,14 +3083,11 @@ static const struct file_operations rb_simple_fops = { static __init int rb_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("tracing_on", 0644, d_tracer, - &ring_buffer_flags, &rb_simple_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_on' entry\n"); + trace_create_file("tracing_on", 0644, d_tracer, + &ring_buffer_flags, &rb_simple_fops); return 0; } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c new file mode 100644 index 0000000..8d68e14 --- /dev/null +++ b/kernel/trace/ring_buffer_benchmark.c @@ -0,0 +1,416 @@ +/* + * ring buffer tester and benchmark + * + * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/ring_buffer.h> +#include <linux/completion.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/time.h> + +struct rb_page { + u64 ts; + local_t commit; + char data[4080]; +}; + +/* run time and sleep time in seconds */ +#define RUN_TIME 10 +#define SLEEP_TIME 10 + +/* number of events for writer to wake up the reader */ +static int wakeup_interval = 100; + +static int reader_finish; +static struct completion read_start; +static struct completion read_done; + +static struct ring_buffer *buffer; +static struct task_struct *producer; +static struct task_struct *consumer; +static unsigned long read; + +static int disable_reader; +module_param(disable_reader, uint, 0644); +MODULE_PARM_DESC(disable_reader, "only run producer"); + +static int read_events; + +static int kill_test; + +#define KILL_TEST() \ + do { \ + if (!kill_test) { \ + kill_test = 1; \ + WARN_ON(1); \ + } \ + } while (0) + +enum event_status { + EVENT_FOUND, + EVENT_DROPPED, +}; + +static enum event_status read_event(int cpu) +{ + struct ring_buffer_event *event; + int *entry; + u64 ts; + + event = ring_buffer_consume(buffer, cpu, &ts); + if (!event) + return EVENT_DROPPED; + + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + return EVENT_DROPPED; + } + + read++; + return EVENT_FOUND; +} + +static enum event_status read_page(int cpu) +{ + struct ring_buffer_event *event; + struct rb_page *rpage; + unsigned long commit; + void *bpage; + int *entry; + int ret; + int inc; + int i; + + bpage = ring_buffer_alloc_read_page(buffer); + if (!bpage) + return EVENT_DROPPED; + + ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); + if (ret >= 0) { + rpage = bpage; + commit = local_read(&rpage->commit); + for (i = 0; i < commit && !kill_test; i += inc) { + + if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { + KILL_TEST(); + break; + } + + inc = -1; + event = (void *)&rpage->data[i]; + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + /* We don't expect any padding */ + KILL_TEST(); + break; + case RINGBUF_TYPE_TIME_EXTEND: + inc = 8; + break; + case 0: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + if (!event->array[0]) { + KILL_TEST(); + break; + } + inc = event->array[0]; + break; + default: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + inc = ((event->type_len + 1) * 4); + } + if (kill_test) + break; + + if (inc <= 0) { + KILL_TEST(); + break; + } + } + } + ring_buffer_free_read_page(buffer, bpage); + + if (ret < 0) + return EVENT_DROPPED; + return EVENT_FOUND; +} + +static void ring_buffer_consumer(void) +{ + /* toggle between reading pages and events */ + read_events ^= 1; + + read = 0; + while (!reader_finish && !kill_test) { + int found; + + do { + int cpu; + + found = 0; + for_each_online_cpu(cpu) { + enum event_status stat; + + if (read_events) + stat = read_event(cpu); + else + stat = read_page(cpu); + + if (kill_test) + break; + if (stat == EVENT_FOUND) + found = 1; + } + } while (found && !kill_test); + + set_current_state(TASK_INTERRUPTIBLE); + if (reader_finish) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + reader_finish = 0; + complete(&read_done); +} + +static void ring_buffer_producer(void) +{ + struct timeval start_tv; + struct timeval end_tv; + unsigned long long time; + unsigned long long entries; + unsigned long long overruns; + unsigned long missed = 0; + unsigned long hit = 0; + unsigned long avg; + int cnt = 0; + + /* + * Hammer the buffer for 10 secs (this may + * make the system stall) + */ + pr_info("Starting ring buffer hammer\n"); + do_gettimeofday(&start_tv); + do { + struct ring_buffer_event *event; + int *entry; + + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer, event); + } + do_gettimeofday(&end_tv); + + cnt++; + if (consumer && !(cnt % wakeup_interval)) + wake_up_process(consumer); + +#ifndef CONFIG_PREEMPT + /* + * If we are a non preempt kernel, the 10 second run will + * stop everything while it runs. Instead, we will call + * cond_resched and also add any time that was lost by a + * rescedule. + * + * Do a cond resched at the same frequency we would wake up + * the reader. + */ + if (cnt % wakeup_interval) + cond_resched(); +#endif + + } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); + pr_info("End ring buffer hammer\n"); + + if (consumer) { + /* Init both completions here to avoid races */ + init_completion(&read_start); + init_completion(&read_done); + /* the completions must be visible before the finish var */ + smp_wmb(); + reader_finish = 1; + /* finish var visible before waking up the consumer */ + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_done); + } + + time = end_tv.tv_sec - start_tv.tv_sec; + time *= USEC_PER_SEC; + time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); + + entries = ring_buffer_entries(buffer); + overruns = ring_buffer_overruns(buffer); + + if (kill_test) + pr_info("ERROR!\n"); + pr_info("Time: %lld (usecs)\n", time); + pr_info("Overruns: %lld\n", overruns); + if (disable_reader) + pr_info("Read: (reader disabled)\n"); + else + pr_info("Read: %ld (by %s)\n", read, + read_events ? "events" : "pages"); + pr_info("Entries: %lld\n", entries); + pr_info("Total: %lld\n", entries + overruns + read); + pr_info("Missed: %ld\n", missed); + pr_info("Hit: %ld\n", hit); + + /* Convert time from usecs to millisecs */ + do_div(time, USEC_PER_MSEC); + if (time) + hit /= (long)time; + else + pr_info("TIME IS ZERO??\n"); + + pr_info("Entries per millisec: %ld\n", hit); + + if (hit) { + /* Calculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / hit; + pr_info("%ld ns per entry\n", avg); + } + + if (missed) { + if (time) + missed /= (long)time; + + pr_info("Total iterations per millisec: %ld\n", hit + missed); + + /* it is possible that hit + missed will overflow and be zero */ + if (!(hit + missed)) { + pr_info("hit + missed overflowed and totalled zero!\n"); + hit--; /* make it non zero */ + } + + /* Caculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / (hit + missed); + pr_info("%ld ns per entry\n", avg); + } +} + +static void wait_to_die(void) +{ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); +} + +static int ring_buffer_consumer_thread(void *arg) +{ + while (!kthread_should_stop() && !kill_test) { + complete(&read_start); + + ring_buffer_consumer(); + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop() || kill_test) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + __set_current_state(TASK_RUNNING); + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int ring_buffer_producer_thread(void *arg) +{ + init_completion(&read_start); + + while (!kthread_should_stop() && !kill_test) { + ring_buffer_reset(buffer); + + if (consumer) { + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_start); + } + + ring_buffer_producer(); + + pr_info("Sleeping for 10 secs\n"); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ * SLEEP_TIME); + __set_current_state(TASK_RUNNING); + } + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int __init ring_buffer_benchmark_init(void) +{ + int ret; + + /* make a one meg buffer in overwite mode */ + buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); + if (!buffer) + return -ENOMEM; + + if (!disable_reader) { + consumer = kthread_create(ring_buffer_consumer_thread, + NULL, "rb_consumer"); + ret = PTR_ERR(consumer); + if (IS_ERR(consumer)) + goto out_fail; + } + + producer = kthread_run(ring_buffer_producer_thread, + NULL, "rb_producer"); + ret = PTR_ERR(producer); + + if (IS_ERR(producer)) + goto out_kill; + + return 0; + + out_kill: + if (consumer) + kthread_stop(consumer); + + out_fail: + ring_buffer_free(buffer); + return ret; +} + +static void __exit ring_buffer_benchmark_exit(void) +{ + kthread_stop(producer); + if (consumer) + kthread_stop(consumer); + ring_buffer_free(buffer); +} + +module_init(ring_buffer_benchmark_init); +module_exit(ring_buffer_benchmark_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("ring_buffer_benchmark"); +MODULE_LICENSE("GPL"); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a884c09..8acd9b8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -171,6 +171,13 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +int filter_current_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer_event *event) +{ + return filter_check_discard(call, rec, global_trace.buffer, event); +} +EXPORT_SYMBOL_GPL(filter_current_check_discard); + cycle_t ftrace_now(int cpu) { u64 ts; @@ -255,7 +262,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | + TRACE_ITER_GRAPH_TIME; /** * trace_wake_up - wake up tasks waiting for trace input @@ -317,6 +325,7 @@ static const char *trace_options[] = { "latency-format", "global-clock", "sleep-time", + "graph-time", NULL }; @@ -402,17 +411,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } -static void -trace_print_seq(struct seq_file *m, struct trace_seq *s) -{ - int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; - - s->buffer[len] = 0; - seq_puts(m, s->buffer); - - trace_seq_init(s); -} - /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer @@ -641,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr) tracing_reset(tr, cpu); } +void tracing_reset_current(int cpu) +{ + tracing_reset(&global_trace, cpu); +} + +void tracing_reset_current_online_cpus(void) +{ + tracing_reset_online_cpus(&global_trace); +} + #define SAVED_CMDLINES 128 #define NO_CMDLINE_MAP UINT_MAX static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; @@ -800,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[]) return; } + preempt_disable(); __raw_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) @@ -808,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[]) strcpy(comm, "<...>"); __raw_spin_unlock(&trace_cmdline_lock); + preempt_enable(); } void tracing_record_cmdline(struct task_struct *tsk) @@ -840,7 +850,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc) { @@ -883,30 +893,40 @@ void trace_buffer_unlock_commit(struct trace_array *tr, } struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, +trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc) { return trace_buffer_lock_reserve(&global_trace, type, len, flags, pc); } +EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); + __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } +EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); + __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); +} +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); + +void trace_current_buffer_discard_commit(struct ring_buffer_event *event) +{ + ring_buffer_discard_commit(global_trace.buffer, event); } +EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_function; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -921,7 +941,9 @@ trace_function(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; - ring_buffer_unlock_commit(tr->buffer, event); + + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -930,6 +952,7 @@ static int __trace_graph_entry(struct trace_array *tr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; struct ftrace_graph_ent_entry *entry; @@ -942,7 +965,8 @@ static int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - ring_buffer_unlock_commit(global_trace.buffer, event); + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(global_trace.buffer, event); return 1; } @@ -952,6 +976,7 @@ static void __trace_graph_return(struct trace_array *tr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; struct ftrace_graph_ret_entry *entry; @@ -964,7 +989,8 @@ static void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - ring_buffer_unlock_commit(global_trace.buffer, event); + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -982,6 +1008,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, int skip, int pc) { #ifdef CONFIG_STACKTRACE + struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; @@ -999,7 +1026,8 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1024,6 +1052,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc) { #ifdef CONFIG_STACKTRACE + struct ftrace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; @@ -1045,7 +1074,8 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1089,6 +1119,7 @@ tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_context_switch; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -1104,7 +1135,9 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_prio = next->prio; entry->next_state = next->state; entry->next_cpu = task_cpu(next); - trace_buffer_unlock_commit(tr, event, flags, pc); + + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, flags, pc); } void @@ -1113,6 +1146,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *curr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -1129,7 +1163,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); } @@ -1230,11 +1265,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static u32 trace_buf[TRACE_BUF_SIZE]; + struct ftrace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; struct bprint_entry *entry; unsigned long flags; + int disable; int resched; int cpu, len = 0, size, pc; @@ -1249,7 +1286,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(atomic_read(&data->disabled))) + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) goto out; /* Lockdep uses trace_printk for lock tracing */ @@ -1269,13 +1307,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); local_irq_restore(flags); out: + atomic_dec_return(&data->disabled); ftrace_preempt_enable(resched); unpause_graph_tracing(); @@ -1288,12 +1328,14 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; + struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; int cpu, len = 0, size, pc; struct print_entry *entry; unsigned long irq_flags; + int disable; if (tracing_disabled || tracing_selftest_running) return 0; @@ -1303,7 +1345,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(atomic_read(&data->disabled))) + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) goto out; pause_graph_tracing(); @@ -1323,13 +1366,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); raw_local_irq_restore(irq_flags); unpause_graph_tracing(); out: + atomic_dec_return(&data->disabled); preempt_enable_notrace(); return len; @@ -1526,12 +1571,14 @@ static void *s_start(struct seq_file *m, loff_t *pos) p = s_next(m, p, &l); } + trace_event_read_lock(); return p; } static void s_stop(struct seq_file *m, void *p) { atomic_dec(&trace_record_cmdline_disabled); + trace_event_read_unlock(); } static void print_lat_help_header(struct seq_file *m) @@ -1774,6 +1821,7 @@ static int trace_empty(struct trace_iterator *iter) return 1; } +/* Called with trace_event_read_lock() held. */ static enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; @@ -2380,7 +2428,7 @@ static const char readme_msg[] = "# echo print-parent > /debug/tracing/trace_options\n" "# echo 1 > /debug/tracing/tracing_enabled\n" "# cat /debug/tracing/trace > /tmp/trace.txt\n" - "echo 0 > /debug/tracing/tracing_enabled\n" + "# echo 0 > /debug/tracing/tracing_enabled\n" ; static ssize_t @@ -2397,6 +2445,56 @@ static const struct file_operations tracing_readme_fops = { }; static ssize_t +tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf_comm; + char *file_buf; + char *buf; + int len = 0; + int pid; + int i; + + file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); + if (!file_buf) + return -ENOMEM; + + buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!buf_comm) { + kfree(file_buf); + return -ENOMEM; + } + + buf = file_buf; + + for (i = 0; i < SAVED_CMDLINES; i++) { + int r; + + pid = map_cmdline_to_pid[i]; + if (pid == -1 || pid == NO_CMDLINE_MAP) + continue; + + trace_find_cmdline(pid, buf_comm); + r = sprintf(buf, "%d %s\n", pid, buf_comm); + buf += r; + len += r; + } + + len = simple_read_from_buffer(ubuf, cnt, ppos, + file_buf, len); + + kfree(file_buf); + kfree(buf_comm); + + return len; +} + +static const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_read, +}; + +static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -2728,6 +2826,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); + if (trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + iter->cpu_file = cpu_file; iter->tr = &global_trace; mutex_init(&iter->mutex); @@ -2915,6 +3016,7 @@ waitagain: offsetof(struct trace_iterator, seq)); iter->pos = -1; + trace_event_read_lock(); while (find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -2931,6 +3033,7 @@ waitagain: if (iter->seq.len >= cnt) break; } + trace_event_read_unlock(); /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); @@ -3053,6 +3156,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, goto out_err; } + trace_event_read_lock(); + /* Fill as many pages as possible. */ for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { pages[i] = alloc_page(GFP_KERNEL); @@ -3075,6 +3180,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_seq_init(&iter->seq); } + trace_event_read_unlock(); mutex_unlock(&iter->mutex); spd.nr_pages = i; @@ -3425,7 +3531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; - int size, i; + int entries, size, i; size_t ret; if (*ppos & (PAGE_SIZE - 1)) { @@ -3440,7 +3546,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } - for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) { + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + + for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -3457,7 +3565,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } r = ring_buffer_read_page(ref->buffer, &ref->page, - len, info->cpu, 0); + len, info->cpu, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->page); @@ -3481,6 +3589,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; *ppos += PAGE_SIZE; + + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); } spd.nr_pages = i; @@ -3508,6 +3618,45 @@ static const struct file_operations tracing_buffers_fops = { .llseek = no_llseek, }; +static ssize_t +tracing_stats_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned long cpu = (unsigned long)filp->private_data; + struct trace_array *tr = &global_trace; + struct trace_seq *s; + unsigned long cnt; + + s = kmalloc(sizeof(*s), GFP_ATOMIC); + if (!s) + return ENOMEM; + + trace_seq_init(s); + + cnt = ring_buffer_entries_cpu(tr->buffer, cpu); + trace_seq_printf(s, "entries: %ld\n", cnt); + + cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "overrun: %ld\n", cnt); + + cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "commit overrun: %ld\n", cnt); + + cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu); + trace_seq_printf(s, "nmi dropped: %ld\n", cnt); + + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + + kfree(s); + + return count; +} + +static const struct file_operations tracing_stats_fops = { + .open = tracing_open_generic, + .read = tracing_stats_read, +}; + #ifdef CONFIG_DYNAMIC_FTRACE int __weak ftrace_arch_read_dyn_info(char *buf, int size) @@ -3597,7 +3746,7 @@ struct dentry *tracing_dentry_percpu(void) static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); - struct dentry *entry, *d_cpu; + struct dentry *d_cpu; /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ char cpu_dir[7]; @@ -3612,21 +3761,18 @@ static void tracing_init_debugfs_percpu(long cpu) } /* per cpu trace_pipe */ - entry = debugfs_create_file("trace_pipe", 0444, d_cpu, - (void *) cpu, &tracing_pipe_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_pipe' entry\n"); + trace_create_file("trace_pipe", 0444, d_cpu, + (void *) cpu, &tracing_pipe_fops); /* per cpu trace */ - entry = debugfs_create_file("trace", 0644, d_cpu, - (void *) cpu, &tracing_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + trace_create_file("trace", 0644, d_cpu, + (void *) cpu, &tracing_fops); + + trace_create_file("trace_pipe_raw", 0444, d_cpu, + (void *) cpu, &tracing_buffers_fops); - entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, - (void *) cpu, &tracing_buffers_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n"); + trace_create_file("stats", 0444, d_cpu, + (void *) cpu, &tracing_stats_fops); } #ifdef CONFIG_FTRACE_SELFTEST @@ -3782,6 +3928,22 @@ static const struct file_operations trace_options_core_fops = { .write = trace_options_core_write, }; +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops) +{ + struct dentry *ret; + + ret = debugfs_create_file(name, mode, parent, data, fops); + if (!ret) + pr_warning("Could not create debugfs '%s' entry\n", name); + + return ret; +} + + static struct dentry *trace_options_init_dentry(void) { struct dentry *d_tracer; @@ -3809,7 +3971,6 @@ create_trace_option_file(struct trace_option_dentry *topt, struct tracer_opt *opt) { struct dentry *t_options; - struct dentry *entry; t_options = trace_options_init_dentry(); if (!t_options) @@ -3818,11 +3979,9 @@ create_trace_option_file(struct trace_option_dentry *topt, topt->flags = flags; topt->opt = opt; - entry = debugfs_create_file(opt->name, 0644, t_options, topt, + topt->entry = trace_create_file(opt->name, 0644, t_options, topt, &trace_options_fops); - topt->entry = entry; - } static struct trace_option_dentry * @@ -3877,123 +4036,84 @@ static struct dentry * create_trace_option_core_file(const char *option, long index) { struct dentry *t_options; - struct dentry *entry; t_options = trace_options_init_dentry(); if (!t_options) return NULL; - entry = debugfs_create_file(option, 0644, t_options, (void *)index, + return trace_create_file(option, 0644, t_options, (void *)index, &trace_options_core_fops); - - return entry; } static __init void create_trace_options_dir(void) { struct dentry *t_options; - struct dentry *entry; int i; t_options = trace_options_init_dentry(); if (!t_options) return; - for (i = 0; trace_options[i]; i++) { - entry = create_trace_option_core_file(trace_options[i], i); - if (!entry) - pr_warning("Could not create debugfs %s entry\n", - trace_options[i]); - } + for (i = 0; trace_options[i]; i++) + create_trace_option_core_file(trace_options[i], i); } static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; int cpu; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, - &global_trace, &tracing_ctrl_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); + trace_create_file("tracing_enabled", 0644, d_tracer, + &global_trace, &tracing_ctrl_fops); - entry = debugfs_create_file("trace_options", 0644, d_tracer, - NULL, &tracing_iter_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_options' entry\n"); + trace_create_file("trace_options", 0644, d_tracer, + NULL, &tracing_iter_fops); - create_trace_options_dir(); + trace_create_file("tracing_cpumask", 0644, d_tracer, + NULL, &tracing_cpumask_fops); + + trace_create_file("trace", 0644, d_tracer, + (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); - entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, - NULL, &tracing_cpumask_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); - - entry = debugfs_create_file("trace", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); - - entry = debugfs_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); - if (!entry) - pr_warning("Could not create debugfs 'available_tracers' entry\n"); - - entry = debugfs_create_file("current_tracer", 0444, d_tracer, - &global_trace, &set_tracer_fops); - if (!entry) - pr_warning("Could not create debugfs 'current_tracer' entry\n"); - - entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, - &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_max_latency' entry\n"); - - entry = debugfs_create_file("tracing_thresh", 0644, d_tracer, - &tracing_thresh, &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_thresh' entry\n"); - entry = debugfs_create_file("README", 0644, d_tracer, - NULL, &tracing_readme_fops); - if (!entry) - pr_warning("Could not create debugfs 'README' entry\n"); - - entry = debugfs_create_file("trace_pipe", 0444, d_tracer, + trace_create_file("available_tracers", 0444, d_tracer, + &global_trace, &show_traces_fops); + + trace_create_file("current_tracer", 0644, d_tracer, + &global_trace, &set_tracer_fops); + + trace_create_file("tracing_max_latency", 0644, d_tracer, + &tracing_max_latency, &tracing_max_lat_fops); + + trace_create_file("tracing_thresh", 0644, d_tracer, + &tracing_thresh, &tracing_max_lat_fops); + + trace_create_file("README", 0444, d_tracer, + NULL, &tracing_readme_fops); + + trace_create_file("trace_pipe", 0444, d_tracer, (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'trace_pipe' entry\n"); - - entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, - &global_trace, &tracing_entries_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'buffer_size_kb' entry\n"); - - entry = debugfs_create_file("trace_marker", 0220, d_tracer, - NULL, &tracing_mark_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'trace_marker' entry\n"); + + trace_create_file("buffer_size_kb", 0644, d_tracer, + &global_trace, &tracing_entries_fops); + + trace_create_file("trace_marker", 0220, d_tracer, + NULL, &tracing_mark_fops); + + trace_create_file("saved_cmdlines", 0444, d_tracer, + NULL, &tracing_saved_cmdlines_fops); #ifdef CONFIG_DYNAMIC_FTRACE - entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, - &tracing_dyn_info_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'dyn_ftrace_total_info' entry\n"); + trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, + &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif #ifdef CONFIG_SYSPROF_TRACER init_tracer_sysprof_debugfs(d_tracer); #endif + create_trace_options_dir(); + for_each_tracing_cpu(cpu) tracing_init_debugfs_percpu(cpu); @@ -4064,7 +4184,8 @@ trace_printk_seq(struct trace_seq *s) static void __ftrace_dump(bool disable_tracing) { - static DEFINE_SPINLOCK(ftrace_dump_lock); + static raw_spinlock_t ftrace_dump_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; unsigned int old_userobj; @@ -4073,7 +4194,8 @@ static void __ftrace_dump(bool disable_tracing) int cnt = 0, cpu; /* only one dump */ - spin_lock_irqsave(&ftrace_dump_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&ftrace_dump_lock); if (dump_ran) goto out; @@ -4145,7 +4267,8 @@ static void __ftrace_dump(bool disable_tracing) } out: - spin_unlock_irqrestore(&ftrace_dump_lock, flags); + __raw_spin_unlock(&ftrace_dump_lock); + local_irq_restore(flags); } /* By default: disable tracing after the dump */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e685ac2..6e735d4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,9 +9,12 @@ #include <linux/mmiotrace.h> #include <linux/ftrace.h> #include <trace/boot.h> -#include <trace/kmemtrace.h> +#include <linux/kmemtrace.h> #include <trace/power.h> +#include <linux/trace_seq.h> +#include <linux/ftrace_event.h> + enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -42,20 +45,6 @@ enum trace_type { }; /* - * The trace entry - the most basic unit of tracing. This is what - * is printed in the end as a single line in the trace output, such as: - * - * bash-15816 [01] 235.197585: idle_cpu <- irq_enter - */ -struct trace_entry { - unsigned char type; - unsigned char flags; - unsigned char preempt_count; - int pid; - int tgid; -}; - -/* * Function trace entry - function address and parent function addres: */ struct ftrace_entry { @@ -263,8 +252,6 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; }; -struct trace_iterator; - /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -339,15 +326,6 @@ extern void __ftrace_bad_type(void); __ftrace_bad_type(); \ } while (0) -/* Return values for print_line callback */ -enum print_line_t { - TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ - TRACE_TYPE_HANDLED = 1, - TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ - TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ -}; - - /* * An option specific to a tracer. This is a boolean value. * The bit is the bit index that sets its value on the @@ -423,60 +401,30 @@ struct tracer { struct tracer_stat *stats; }; -struct trace_seq { - unsigned char buffer[PAGE_SIZE]; - unsigned int len; - unsigned int readpos; -}; - -static inline void -trace_seq_init(struct trace_seq *s) -{ - s->len = 0; - s->readpos = 0; -} - #define TRACE_PIPE_ALL_CPU -1 -/* - * Trace iterator - used by printout routines who present trace - * results to users and which routines might sleep, etc: - */ -struct trace_iterator { - struct trace_array *tr; - struct tracer *trace; - void *private; - int cpu_file; - struct mutex mutex; - struct ring_buffer_iter *buffer_iter[NR_CPUS]; - - /* The below is zeroed out in pipe_read */ - struct trace_seq seq; - struct trace_entry *ent; - int cpu; - u64 ts; - - unsigned long iter_flags; - loff_t pos; - long idx; - - cpumask_var_t started; -}; - int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset_current(int cpu); +void tracing_reset_current_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops); + struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc); @@ -484,14 +432,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr, struct ring_buffer_event *event, unsigned long flags, int pc); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, - unsigned long flags, int pc); -void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); - struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -514,7 +454,6 @@ void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, unsigned long flags, int pc); -void tracing_record_cmdline(struct task_struct *tsk); void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, @@ -599,6 +538,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_hw_branches(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); @@ -613,6 +554,8 @@ extern unsigned long trace_flags; /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern enum print_line_t print_graph_function(struct trace_iterator *iter); +extern enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ @@ -644,7 +587,6 @@ static inline int ftrace_graph_addr(unsigned long addr) return 1; } #endif /* CONFIG_DYNAMIC_FTRACE */ - #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function(struct trace_iterator *iter) @@ -692,6 +634,7 @@ enum trace_iterator_flags { TRACE_ITER_LATENCY_FMT = 0x40000, TRACE_ITER_GLOBAL_CLK = 0x80000, TRACE_ITER_SLEEP_TIME = 0x100000, + TRACE_ITER_GRAPH_TIME = 0x200000, }; /* @@ -790,103 +733,113 @@ struct ftrace_event_field { char *type; int offset; int size; + int is_signed; }; -struct ftrace_event_call { - char *name; - char *system; - struct dentry *dir; - int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); - int id; - int (*raw_init)(void); - int (*show_format)(struct trace_seq *s); - int (*define_fields)(void); - struct list_head fields; +struct event_filter { + int n_preds; struct filter_pred **preds; - -#ifdef CONFIG_EVENT_PROFILE - atomic_t profile_count; - int (*profile_enable)(struct ftrace_event_call *); - void (*profile_disable)(struct ftrace_event_call *); -#endif + char *filter_string; }; struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; - struct filter_pred **preds; + void *filter; }; -#define events_for_each(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - -#define MAX_FILTER_PRED 8 - struct filter_pred; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, + int val1, int val2); struct filter_pred { filter_pred_fn_t fn; u64 val; - char *str_val; + char str_val[MAX_FILTER_STR_VAL]; int str_len; char *field_name; int offset; int not; - int or; - int compound; - int clear; + int op; + int pop_n; }; -int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size); -extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct filter_pred **preds, +extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); -extern int filter_parse(char **pbuf, struct filter_pred *pred); -extern int filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred); -extern void filter_free_preds(struct ftrace_event_call *call); -extern int filter_match_preds(struct ftrace_event_call *call, void *rec); -extern void filter_free_subsystem_preds(struct event_subsystem *system); -extern int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred); - -void event_trace_printk(unsigned long ip, const char *fmt, ...); -extern struct ftrace_event_call __start_ftrace_events[]; -extern struct ftrace_event_call __stop_ftrace_events[]; - -#define for_each_event(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) +extern int apply_event_filter(struct ftrace_event_call *call, + char *filter_string); +extern int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string); +extern void print_subsystem_event_filter(struct event_subsystem *system, + struct trace_seq *s); + +static inline int +filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; +} + +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + +extern struct mutex event_mutex; +extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; -/* - * The double __builtin_constant_p is because gcc will give us an error - * if we try to allocate the static variable to fmt if it is not a - * constant. Even with the outer if statement optimizing out. - */ -#define event_trace_printk(ip, fmt, args...) \ -do { \ - __trace_printk_check_format(fmt, ##args); \ - tracing_record_cmdline(current); \ - if (__builtin_constant_p(fmt)) { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))) = \ - __builtin_constant_p(fmt) ? fmt : NULL; \ - \ - __trace_bprintk(ip, trace_printk_fmt, ##args); \ - } else \ - __trace_printk(ip, fmt, ##args); \ -} while (0) +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ + extern struct ftrace_event_call event_##call; +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) +#include "trace_event_types.h" #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 7a30fc4..a29ef23 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -9,6 +9,7 @@ #include <linux/debugfs.h> #include <linux/ftrace.h> #include <linux/kallsyms.h> +#include <linux/time.h> #include "trace.h" #include "trace_output.h" @@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter) trace_assign_type(field, entry); call = &field->boot_call; ts = iter->ts; - nsec_rem = do_div(ts, 1000000000); + nsec_rem = do_div(ts, NSEC_PER_SEC); ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", (unsigned long)ts, nsec_rem, call->func, call->caller); @@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter) trace_assign_type(field, entry); init_ret = &field->boot_ret; ts = iter->ts; - nsec_rem = do_div(ts, 1000000000); + nsec_rem = do_div(ts, NSEC_PER_SEC); ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " "returned %d after %llu msecs\n", diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 8333715..7a7a9fd 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -30,6 +30,7 @@ static struct trace_array *branch_tracer; static void probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { + struct ftrace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; @@ -73,7 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); @@ -271,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v) return 0; } -static void *annotated_branch_stat_start(void) +static void *annotated_branch_stat_start(struct tracer_stat *trace) { return __start_annotated_branch_profile; } @@ -346,7 +348,7 @@ static int all_branch_stat_headers(struct seq_file *m) return 0; } -static void *all_branch_stat_start(void) +static void *all_branch_stat_start(struct tracer_stat *trace) { return __start_branch_profile; } diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 22cba99..5b5895a 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -10,22 +10,30 @@ int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; + int ret = -EINVAL; - for_each_event(event) { - if (event->id == event_id) - return event->profile_enable(event); + mutex_lock(&event_mutex); + list_for_each_entry(event, &ftrace_events, list) { + if (event->id == event_id) { + ret = event->profile_enable(event); + break; + } } + mutex_unlock(&event_mutex); - return -EINVAL; + return ret; } void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; - for_each_event(event) { - if (event->id == event_id) - return event->profile_disable(event); + mutex_lock(&event_mutex); + list_for_each_entry(event, &ftrace_events, list) { + if (event->id == event_id) { + event->profile_disable(event); + break; + } } + mutex_unlock(&event_mutex); } - diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index fd78bee..5e32e37 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); -TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, +TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, arg1, arg1) TRACE_FIELD(unsigned long, arg2, arg2) @@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned int, line, line) - TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) - TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) + TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, + TRACE_FUNC_SIZE+1, func) + TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, + TRACE_FUNC_SIZE+1, file) TRACE_FIELD(char, correct, correct) ), TP_RAW_FMT("%u:%s:%s (%u)") @@ -139,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, TRACE_STRUCT( - TRACE_FIELD(ktime_t, state_data.stamp, stamp) - TRACE_FIELD(ktime_t, state_data.end, end) + TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) + TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) TRACE_FIELD(int, state_data.type, type) TRACE_FIELD(int, state_data.state, state) ), diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 576f4fa..aa08be6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -8,19 +8,25 @@ * */ +#include <linux/workqueue.h> +#include <linux/spinlock.h> +#include <linux/kthread.h> #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> +#include <linux/delay.h> #include "trace_output.h" #define TRACE_SYSTEM "TRACE_SYSTEM" -static DEFINE_MUTEX(event_mutex); +DEFINE_MUTEX(event_mutex); + +LIST_HEAD(ftrace_events); int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size) + char *name, int offset, int size, int is_signed) { struct ftrace_event_field *field; @@ -38,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type, field->offset = offset; field->size = size; + field->is_signed = is_signed; list_add(&field->link, &call->fields); return 0; @@ -51,47 +58,94 @@ err: return -ENOMEM; } +EXPORT_SYMBOL_GPL(trace_define_field); -static void ftrace_clear_events(void) -{ - struct ftrace_event_call *call = (void *)__start_ftrace_events; - +#ifdef CONFIG_MODULES - while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { +static void trace_destroy_fields(struct ftrace_event_call *call) +{ + struct ftrace_event_field *field, *next; - if (call->enabled) { - call->enabled = 0; - call->unregfunc(); - } - call++; + list_for_each_entry_safe(field, next, &call->fields, link) { + list_del(&field->link); + kfree(field->type); + kfree(field->name); + kfree(field); } } +#endif /* CONFIG_MODULES */ + static void ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { - switch (enable) { case 0: if (call->enabled) { call->enabled = 0; + tracing_stop_cmdline_record(); call->unregfunc(); } break; case 1: if (!call->enabled) { call->enabled = 1; + tracing_start_cmdline_record(); call->regfunc(); } break; } } +static void ftrace_clear_events(void) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + ftrace_event_enable_disable(call, 0); + } + mutex_unlock(&event_mutex); +} + +/* + * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. + */ +static int __ftrace_set_clr_event(const char *match, const char *sub, + const char *event, int set) +{ + struct ftrace_event_call *call; + int ret = -EINVAL; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + + if (!call->name || !call->regfunc) + continue; + + if (match && + strcmp(match, call->name) != 0 && + strcmp(match, call->system) != 0) + continue; + + if (sub && strcmp(sub, call->system) != 0) + continue; + + if (event && strcmp(event, call->name) != 0) + continue; + + ftrace_event_enable_disable(call, set); + + ret = 0; + } + mutex_unlock(&event_mutex); + + return ret; +} + static int ftrace_set_clr_event(char *buf, int set) { - struct ftrace_event_call *call = __start_ftrace_events; char *event = NULL, *sub = NULL, *match; - int ret = -EINVAL; /* * The buf format can be <subsystem>:<event-name> @@ -117,30 +171,24 @@ static int ftrace_set_clr_event(char *buf, int set) event = NULL; } - mutex_lock(&event_mutex); - for_each_event(call) { - - if (!call->name || !call->regfunc) - continue; - - if (match && - strcmp(match, call->name) != 0 && - strcmp(match, call->system) != 0) - continue; - - if (sub && strcmp(sub, call->system) != 0) - continue; - - if (event && strcmp(event, call->name) != 0) - continue; - - ftrace_event_enable_disable(call, set); - - ret = 0; - } - mutex_unlock(&event_mutex); + return __ftrace_set_clr_event(match, sub, event, set); +} - return ret; +/** + * trace_set_clr_event - enable or disable an event + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @set: 1 to enable, 0 to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. + */ +int trace_set_clr_event(const char *system, const char *event, int set) +{ + return __ftrace_set_clr_event(NULL, system, event, set); } /* 128 should be much more than enough */ @@ -224,15 +272,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next = call; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; for (;;) { - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. @@ -240,45 +290,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (call->regfunc) break; - call++; - next = call; + list = list->next; } - m->private = ++next; + m->private = list->next; return call; } static void *t_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&event_mutex); + if (*pos == 0) + m->private = ftrace_events.next; return t_next(m, NULL, pos); } static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; retry: - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + if (!call->enabled) { - call++; + list = list->next; goto retry; } - next = call; - m->private = ++next; + m->private = list->next; return call; } static void *s_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&event_mutex); + if (*pos == 0) + m->private = ftrace_events.next; return s_next(m, NULL, pos); } @@ -295,12 +351,12 @@ static int t_show(struct seq_file *m, void *v) static void t_stop(struct seq_file *m, void *p) { + mutex_unlock(&event_mutex); } static int ftrace_event_seq_open(struct inode *inode, struct file *file) { - int ret; const struct seq_operations *seq_ops; if ((file->f_mode & FMODE_WRITE) && @@ -308,13 +364,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) ftrace_clear_events(); seq_ops = inode->i_private; - ret = seq_open(file, seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = __start_ftrace_events; - } - return ret; + return seq_open(file, seq_ops); } static ssize_t @@ -374,8 +424,93 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char set_to_char[4] = { '?', '0', '1', 'X' }; + const char *system = filp->private_data; + struct ftrace_event_call *call; + char buf[2]; + int set = 0; + int ret; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!call->name || !call->regfunc) + continue; + + if (system && strcmp(call->system, system) != 0) + continue; + + /* + * We need to find out if all the events are set + * or if all events or cleared, or if we have + * a mixture. + */ + set |= (1 << !!call->enabled); + + /* + * If we have a mixture, no need to look further. + */ + if (set == 3) + break; + } + mutex_unlock(&event_mutex); + + buf[0] = set_to_char[set]; + buf[1] = '\n'; + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + + return ret; +} + +static ssize_t +system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char *system = filp->private_data; + unsigned long val; + char buf[64]; + ssize_t ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + if (val != 0 && val != 1) + return -EINVAL; + + ret = __ftrace_set_clr_event(NULL, system, NULL, val); + if (ret) + goto out; + + ret = cnt; + +out: + *ppos += cnt; + + return ret; +} + +extern char *__bad_type_size(void); + #undef FIELD #define FIELD(type, name) \ + sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ #type, "common_" #name, offsetof(typeof(field), name), \ sizeof(field.name) @@ -391,7 +526,7 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", - FIELD(unsigned char, type), + FIELD(unsigned short, type), FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), @@ -481,7 +616,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call->preds, s); + print_event_filter(call, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -494,38 +629,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; - char buf[64], *pbuf = buf; - struct filter_pred *pred; + char *buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - filter_free_pred(pred); - return err; - } - - if (pred->clear) { - filter_free_preds(call); - filter_free_pred(pred); - return cnt; + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; } + buf[cnt] = '\0'; - err = filter_add_pred(call, pred); - if (err < 0) { - filter_free_pred(pred); + err = apply_event_filter(call, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } *ppos += cnt; @@ -549,7 +672,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(system->preds, s); + print_subsystem_event_filter(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -562,45 +685,56 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct event_subsystem *system = filp->private_data; - char buf[64], *pbuf = buf; - struct filter_pred *pred; + char *buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - filter_free_pred(pred); - return err; - } - - if (pred->clear) { - filter_free_subsystem_preds(system); - filter_free_pred(pred); - return cnt; + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; } + buf[cnt] = '\0'; - err = filter_add_subsystem_pred(system, pred); - if (err < 0) { - filter_free_subsystem_preds(system); - filter_free_pred(pred); + err = apply_subsystem_event_filter(system, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } *ppos += cnt; return cnt; } +static ssize_t +show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int (*func)(struct trace_seq *s) = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + func(s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + + kfree(s); + + return r; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -658,6 +792,17 @@ static const struct file_operations ftrace_subsystem_filter_fops = { .write = subsystem_filter_write, }; +static const struct file_operations ftrace_system_enable_fops = { + .open = tracing_open_generic, + .read = system_enable_read, + .write = system_enable_write, +}; + +static const struct file_operations ftrace_show_header_fops = { + .open = tracing_open_generic, + .read = show_header, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -684,6 +829,7 @@ static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { struct event_subsystem *system; + struct dentry *entry; /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { @@ -707,16 +853,46 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return d_events; } - system->name = name; + system->name = kstrdup(name, GFP_KERNEL); + if (!system->name) { + debugfs_remove(system->entry); + kfree(system); + return d_events; + } + list_add(&system->list, &event_subsystems); - system->preds = NULL; + system->filter = NULL; + + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) { + pr_warning("Could not allocate filter for subsystem " + "'%s'\n", name); + return system->entry; + } + + entry = debugfs_create_file("filter", 0644, system->entry, system, + &ftrace_subsystem_filter_fops); + if (!entry) { + kfree(system->filter); + system->filter = NULL; + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", name); + } + + entry = trace_create_file("enable", 0644, system->entry, + (void *)system->name, + &ftrace_system_enable_fops); return system->entry; } static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) +event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) { struct dentry *entry; int ret; @@ -725,7 +901,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". */ - if (strcmp(call->system, "TRACE_SYSTEM") != 0) + if (strcmp(call->system, TRACE_SYSTEM) != 0) d_events = event_subsystem_dir(call->system, d_events); if (call->raw_init) { @@ -744,21 +920,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return -1; } - if (call->regfunc) { - entry = debugfs_create_file("enable", 0644, call->dir, call, - &ftrace_enable_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/enable' entry\n", call->name); - } + if (call->regfunc) + entry = trace_create_file("enable", 0644, call->dir, call, + enable); - if (call->id) { - entry = debugfs_create_file("id", 0444, call->dir, call, - &ftrace_event_id_fops); - if (!entry) - pr_warning("Could not create debugfs '%s/id' entry\n", - call->name); - } + if (call->id) + entry = trace_create_file("id", 0444, call->dir, call, + id); if (call->define_fields) { ret = call->define_fields(); @@ -767,32 +935,195 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) " events/%s\n", call->name); return ret; } - entry = debugfs_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", call->name); + entry = trace_create_file("filter", 0644, call->dir, call, + filter); } /* A trace may not want to export its format */ if (!call->show_format) return 0; - entry = debugfs_create_file("format", 0444, call->dir, call, - &ftrace_event_format_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/format' entry\n", call->name); + entry = trace_create_file("format", 0444, call->dir, call, + format); + + return 0; +} + +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +#ifdef CONFIG_MODULES + +static LIST_HEAD(ftrace_module_file_list); + +/* + * Modules must own their file_operations to keep up with + * reference counting. + */ +struct ftrace_module_file_ops { + struct list_head list; + struct module *mod; + struct file_operations id; + struct file_operations enable; + struct file_operations format; + struct file_operations filter; +}; + +static struct ftrace_module_file_ops * +trace_create_file_ops(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops; + + /* + * This is a bit of a PITA. To allow for correct reference + * counting, modules must "own" their file_operations. + * To do this, we allocate the file operations that will be + * used in the event directory. + */ + + file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); + if (!file_ops) + return NULL; + + file_ops->mod = mod; + + file_ops->id = ftrace_event_id_fops; + file_ops->id.owner = mod; + + file_ops->enable = ftrace_enable_fops; + file_ops->enable.owner = mod; + + file_ops->filter = ftrace_event_filter_fops; + file_ops->filter.owner = mod; + + file_ops->format = ftrace_event_format_fops; + file_ops->format.owner = mod; + + list_add(&file_ops->list, &ftrace_module_file_list); + + return file_ops; +} + +static void trace_module_add_events(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops = NULL; + struct ftrace_event_call *call, *start, *end; + struct dentry *d_events; + + start = mod->trace_events; + end = mod->trace_events + mod->num_trace_events; + + if (start == end) + return; + + d_events = event_trace_events_dir(); + if (!d_events) + return; + + for_each_event(call, start, end) { + /* The linker may leave blanks */ + if (!call->name) + continue; + + /* + * This module has events, create file ops for this module + * if not already done. + */ + if (!file_ops) { + file_ops = trace_create_file_ops(mod); + if (!file_ops) + return; + } + call->mod = mod; + list_add(&call->list, &ftrace_events); + event_create_dir(call, d_events, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); + } +} + +static void trace_module_remove_events(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops; + struct ftrace_event_call *call, *p; + bool found = false; + + down_write(&trace_event_mutex); + list_for_each_entry_safe(call, p, &ftrace_events, list) { + if (call->mod == mod) { + found = true; + ftrace_event_enable_disable(call, 0); + if (call->event) + __unregister_ftrace_event(call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); + } + } + + /* Now free the file_operations */ + list_for_each_entry(file_ops, &ftrace_module_file_list, list) { + if (file_ops->mod == mod) + break; + } + if (&file_ops->list != &ftrace_module_file_list) { + list_del(&file_ops->list); + kfree(file_ops); + } + + /* + * It is safest to reset the ring buffer if the module being unloaded + * registered any events. + */ + if (found) + tracing_reset_current_online_cpus(); + up_write(&trace_event_mutex); +} + +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + mutex_lock(&event_mutex); + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_events(mod); + break; + case MODULE_STATE_GOING: + trace_module_remove_events(mod); + break; + } + mutex_unlock(&event_mutex); return 0; } +#else +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + +struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; + +extern struct ftrace_event_call __start_ftrace_events[]; +extern struct ftrace_event_call __stop_ftrace_events[]; static __init int event_trace_init(void) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; + int ret; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -816,13 +1147,243 @@ static __init int event_trace_init(void) if (!d_events) return 0; - for_each_event(call) { + /* ring buffer internal formats */ + trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + + trace_create_file("enable", 0644, d_events, + NULL, &ftrace_system_enable_fops); + + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) continue; - event_create_dir(call, d_events); + list_add(&call->list, &ftrace_events); + event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, &ftrace_event_filter_fops, + &ftrace_event_format_fops); } + ret = register_module_notifier(&trace_module_nb); + if (ret) + pr_warning("Failed to register trace events module notifier\n"); + return 0; } fs_initcall(event_trace_init); + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static DEFINE_SPINLOCK(test_spinlock); +static DEFINE_SPINLOCK(test_spinlock_irq); +static DEFINE_MUTEX(test_mutex); + +static __init void test_work(struct work_struct *dummy) +{ + spin_lock(&test_spinlock); + spin_lock_irq(&test_spinlock_irq); + udelay(1); + spin_unlock_irq(&test_spinlock_irq); + spin_unlock(&test_spinlock); + + mutex_lock(&test_mutex); + msleep(1); + mutex_unlock(&test_mutex); +} + +static __init int event_test_thread(void *unused) +{ + void *test_malloc; + + test_malloc = kmalloc(1234, GFP_KERNEL); + if (!test_malloc) + pr_info("failed to kmalloc\n"); + + schedule_on_each_cpu(test_work); + + kfree(test_malloc); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) + schedule(); + + return 0; +} + +/* + * Do various things that may trigger events. + */ +static __init void event_test_stuff(void) +{ + struct task_struct *test_thread; + + test_thread = kthread_run(event_test_thread, NULL, "test-events"); + msleep(1); + kthread_stop(test_thread); +} + +/* + * For every trace event defined, we will test each trace point separately, + * and then by groups, and finally all trace points. + */ +static __init void event_trace_self_tests(void) +{ + struct ftrace_event_call *call; + struct event_subsystem *system; + int ret; + + pr_info("Running tests on trace events:\n"); + + list_for_each_entry(call, &ftrace_events, list) { + + /* Only test those that have a regfunc */ + if (!call->regfunc) + continue; + + pr_info("Testing event %s: ", call->name); + + /* + * If an event is already enabled, someone is using + * it and the self test should not be on. + */ + if (call->enabled) { + pr_warning("Enabled event during self test!\n"); + WARN_ON_ONCE(1); + continue; + } + + ftrace_event_enable_disable(call, 1); + event_test_stuff(); + ftrace_event_enable_disable(call, 0); + + pr_cont("OK\n"); + } + + /* Now test at the sub system level */ + + pr_info("Running tests on trace event systems:\n"); + + list_for_each_entry(system, &event_subsystems, list) { + + /* the ftrace system is special, skip it */ + if (strcmp(system->name, "ftrace") == 0) + continue; + + pr_info("Testing event system %s: ", system->name); + + ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); + if (WARN_ON_ONCE(ret)) { + pr_warning("error enabling system %s\n", + system->name); + continue; + } + + event_test_stuff(); + + ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); + if (WARN_ON_ONCE(ret)) + pr_warning("error disabling system %s\n", + system->name); + + pr_cont("OK\n"); + } + + /* Test with all events enabled */ + + pr_info("Running tests on all trace events:\n"); + pr_info("Testing all events: "); + + ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); + if (WARN_ON_ONCE(ret)) { + pr_warning("error enabling all events\n"); + return; + } + + event_test_stuff(); + + /* reset sysname */ + ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); + if (WARN_ON_ONCE(ret)) { + pr_warning("error disabling all events\n"); + return; + } + + pr_cont("OK\n"); +} + +#ifdef CONFIG_FUNCTION_TRACER + +static DEFINE_PER_CPU(atomic_t, test_event_disable); + +static void +function_test_events_call(unsigned long ip, unsigned long parent_ip) +{ + struct ring_buffer_event *event; + struct ftrace_entry *entry; + unsigned long flags; + long disabled; + int resched; + int cpu; + int pc; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + + if (disabled != 1) + goto out; + + local_save_flags(flags); + + event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), + flags, pc); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->parent_ip = parent_ip; + + trace_nowake_buffer_unlock_commit(event, flags, pc); + + out: + atomic_dec(&per_cpu(test_event_disable, cpu)); + ftrace_preempt_enable(resched); +} + +static struct ftrace_ops trace_ops __initdata = +{ + .func = function_test_events_call, +}; + +static __init void event_trace_self_test_with_function(void) +{ + register_ftrace_function(&trace_ops); + pr_info("Running tests again, along with the function tracer\n"); + event_trace_self_tests(); + unregister_ftrace_function(&trace_ops); +} +#else +static __init void event_trace_self_test_with_function(void) +{ +} +#endif + +static __init int event_trace_self_tests_init(void) +{ + + event_trace_self_tests(); + + event_trace_self_test_with_function(); + + return 0; +} + +late_initcall(event_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e03cbf1..db6e54b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -22,119 +22,297 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> +#include <linux/mutex.h> #include "trace.h" #include "trace_output.h" -static int filter_pred_64(struct filter_pred *pred, void *event) +static DEFINE_MUTEX(filter_mutex); + +enum filter_op_ids +{ + OP_OR, + OP_AND, + OP_NE, + OP_EQ, + OP_LT, + OP_LE, + OP_GT, + OP_GE, + OP_NONE, + OP_OPEN_PAREN, +}; + +struct filter_op { + int id; + char *string; + int precedence; +}; + +static struct filter_op filter_ops[] = { + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, +}; + +enum { + FILT_ERR_NONE, + FILT_ERR_INVALID_OP, + FILT_ERR_UNBALANCED_PAREN, + FILT_ERR_TOO_MANY_OPERANDS, + FILT_ERR_OPERAND_TOO_LONG, + FILT_ERR_FIELD_NOT_FOUND, + FILT_ERR_ILLEGAL_FIELD_OP, + FILT_ERR_ILLEGAL_INTVAL, + FILT_ERR_BAD_SUBSYS_FILTER, + FILT_ERR_TOO_MANY_PREDS, + FILT_ERR_MISSING_FIELD, + FILT_ERR_INVALID_FILTER, +}; + +static char *err_text[] = { + "No error", + "Invalid operator", + "Unbalanced parens", + "Too many operands", + "Operand too long", + "Field not found", + "Illegal operation for field type", + "Illegal integer value", + "Couldn't find or set field in one of a subsystem's events", + "Too many terms in predicate expression", + "Missing field name and/or value", + "Meaningless filter expression", +}; + +struct opstack_op { + int op; + struct list_head list; +}; + +struct postfix_elt { + int op; + char *operand; + struct list_head list; +}; + +struct filter_parse_state { + struct filter_op *ops; + struct list_head opstack; + struct list_head postfix; + int lasterr; + int lasterr_pos; + + struct { + char *string; + unsigned int cnt; + unsigned int tail; + } infix; + + struct { + char string[MAX_FILTER_STR_VAL]; + int pos; + unsigned int tail; + } operand; +}; + +DEFINE_COMPARISON_PRED(s64); +DEFINE_COMPARISON_PRED(u64); +DEFINE_COMPARISON_PRED(s32); +DEFINE_COMPARISON_PRED(u32); +DEFINE_COMPARISON_PRED(s16); +DEFINE_COMPARISON_PRED(u16); +DEFINE_COMPARISON_PRED(s8); +DEFINE_COMPARISON_PRED(u8); + +DEFINE_EQUALITY_PRED(64); +DEFINE_EQUALITY_PRED(32); +DEFINE_EQUALITY_PRED(16); +DEFINE_EQUALITY_PRED(8); + +static int filter_pred_and(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) +{ + return val1 && val2; +} + +static int filter_pred_or(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) +{ + return val1 || val2; +} + +/* Filter predicate for fixed sized arrays of characters */ +static int filter_pred_string(struct filter_pred *pred, void *event, + int val1, int val2) { - u64 *addr = (u64 *)(event + pred->offset); - u64 val = (u64)pred->val; - int match; + char *addr = (char *)(event + pred->offset); + int cmp, match; + + cmp = strncmp(addr, pred->str_val, pred->str_len); - match = (val == *addr) ^ pred->not; + match = (!cmp) ^ pred->not; return match; } -static int filter_pred_32(struct filter_pred *pred, void *event) +/* + * Filter predicate for dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry. + * Also each of these strings have a field in the entry which + * contains its offset from the beginning of the entry. + * We have then first to get this field, dereference it + * and add it to the address of the entry, and at last we have + * the address of the string. + */ +static int filter_pred_strloc(struct filter_pred *pred, void *event, + int val1, int val2) { - u32 *addr = (u32 *)(event + pred->offset); - u32 val = (u32)pred->val; - int match; + int str_loc = *(int *)(event + pred->offset); + char *addr = (char *)(event + str_loc); + int cmp, match; + + cmp = strncmp(addr, pred->str_val, pred->str_len); - match = (val == *addr) ^ pred->not; + match = (!cmp) ^ pred->not; return match; } -static int filter_pred_16(struct filter_pred *pred, void *event) +static int filter_pred_none(struct filter_pred *pred, void *event, + int val1, int val2) +{ + return 0; +} + +/* return 1 if event matches, 0 otherwise (discard) */ +int filter_match_preds(struct ftrace_event_call *call, void *rec) { - u16 *addr = (u16 *)(event + pred->offset); - u16 val = (u16)pred->val; - int match; + struct event_filter *filter = call->filter; + int match, top = 0, val1 = 0, val2 = 0; + int stack[MAX_FILTER_PRED]; + struct filter_pred *pred; + int i; + + for (i = 0; i < filter->n_preds; i++) { + pred = filter->preds[i]; + if (!pred->pop_n) { + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; + continue; + } + if (pred->pop_n > top) { + WARN_ON_ONCE(1); + return 0; + } + val1 = stack[--top]; + val2 = stack[--top]; + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; + } - match = (val == *addr) ^ pred->not; + return stack[--top]; +} +EXPORT_SYMBOL_GPL(filter_match_preds); - return match; +static void parse_error(struct filter_parse_state *ps, int err, int pos) +{ + ps->lasterr = err; + ps->lasterr_pos = pos; } -static int filter_pred_8(struct filter_pred *pred, void *event) +static void remove_filter_string(struct event_filter *filter) { - u8 *addr = (u8 *)(event + pred->offset); - u8 val = (u8)pred->val; - int match; + kfree(filter->filter_string); + filter->filter_string = NULL; +} - match = (val == *addr) ^ pred->not; +static int replace_filter_string(struct event_filter *filter, + char *filter_string) +{ + kfree(filter->filter_string); + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + return -ENOMEM; - return match; + return 0; } -static int filter_pred_string(struct filter_pred *pred, void *event) +static int append_filter_string(struct event_filter *filter, + char *string) { - char *addr = (char *)(event + pred->offset); - int cmp, match; + int newlen; + char *new_filter_string; - cmp = strncmp(addr, pred->str_val, pred->str_len); + BUG_ON(!filter->filter_string); + newlen = strlen(filter->filter_string) + strlen(string) + 1; + new_filter_string = kmalloc(newlen, GFP_KERNEL); + if (!new_filter_string) + return -ENOMEM; - match = (!cmp) ^ pred->not; + strcpy(new_filter_string, filter->filter_string); + strcat(new_filter_string, string); + kfree(filter->filter_string); + filter->filter_string = new_filter_string; - return match; + return 0; } -/* return 1 if event matches, 0 otherwise (discard) */ -int filter_match_preds(struct ftrace_event_call *call, void *rec) +static void append_filter_err(struct filter_parse_state *ps, + struct event_filter *filter) { - int i, matched, and_failed = 0; - struct filter_pred *pred; + int pos = ps->lasterr_pos; + char *buf, *pbuf; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (call->preds[i]) { - pred = call->preds[i]; - if (and_failed && !pred->or) - continue; - matched = pred->fn(pred, rec); - if (!matched && !pred->or) { - and_failed = 1; - continue; - } else if (matched && pred->or) - return 1; - } else - break; - } + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return; - if (and_failed) - return 0; + append_filter_string(filter, "\n"); + memset(buf, ' ', PAGE_SIZE); + if (pos > PAGE_SIZE - 128) + pos = 0; + buf[pos] = '^'; + pbuf = &buf[pos] + 1; - return 1; + sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); + append_filter_string(filter, buf); + free_page((unsigned long) buf); } -void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) +void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - char *field_name; - struct filter_pred *pred; - int i; + struct event_filter *filter = call->filter; - if (!preds) { + mutex_lock(&filter_mutex); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else trace_seq_printf(s, "none\n"); - return; - } + mutex_unlock(&filter_mutex); +} - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (preds[i]) { - pred = preds[i]; - field_name = pred->field_name; - if (i) - trace_seq_printf(s, pred->or ? "|| " : "&& "); - trace_seq_printf(s, "%s ", field_name); - trace_seq_printf(s, pred->not ? "!= " : "== "); - if (pred->str_val) - trace_seq_printf(s, "%s\n", pred->str_val); - else - trace_seq_printf(s, "%llu\n", pred->val); - } else - break; - } +void print_subsystem_event_filter(struct event_subsystem *system, + struct trace_seq *s) +{ + struct event_filter *filter = system->filter; + + mutex_lock(&filter_mutex); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); + mutex_unlock(&filter_mutex); } static struct ftrace_event_field * @@ -150,284 +328,828 @@ find_event_field(struct ftrace_event_call *call, char *name) return NULL; } -void filter_free_pred(struct filter_pred *pred) +static void filter_free_pred(struct filter_pred *pred) { if (!pred) return; kfree(pred->field_name); - kfree(pred->str_val); kfree(pred); } -void filter_free_preds(struct ftrace_event_call *call) +static void filter_clear_pred(struct filter_pred *pred) { - int i; + kfree(pred->field_name); + pred->field_name = NULL; + pred->str_len = 0; +} - if (call->preds) { - for (i = 0; i < MAX_FILTER_PRED; i++) - filter_free_pred(call->preds[i]); - kfree(call->preds); - call->preds = NULL; +static int filter_set_pred(struct filter_pred *dest, + struct filter_pred *src, + filter_pred_fn_t fn) +{ + *dest = *src; + if (src->field_name) { + dest->field_name = kstrdup(src->field_name, GFP_KERNEL); + if (!dest->field_name) + return -ENOMEM; } + dest->fn = fn; + + return 0; } -void filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_disable_preds(struct ftrace_event_call *call) { - struct ftrace_event_call *call = __start_ftrace_events; + struct event_filter *filter = call->filter; int i; - if (system->preds) { - for (i = 0; i < MAX_FILTER_PRED; i++) - filter_free_pred(system->preds[i]); - kfree(system->preds); - system->preds = NULL; - } + call->filter_active = 0; + filter->n_preds = 0; - events_for_each(call) { - if (!call->name || !call->regfunc) - continue; + for (i = 0; i < MAX_FILTER_PRED; i++) + filter->preds[i]->fn = filter_pred_none; +} + +void destroy_preds(struct ftrace_event_call *call) +{ + struct event_filter *filter = call->filter; + int i; - if (!strcmp(call->system, system->name)) - filter_free_preds(call); + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); } + kfree(filter->preds); + kfree(filter); + call->filter = NULL; } -static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred) +int init_preds(struct ftrace_event_call *call) { + struct event_filter *filter; + struct filter_pred *pred; int i; - if (call->preds && !pred->compound) - filter_free_preds(call); + filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!call->filter) + return -ENOMEM; - if (!call->preds) { - call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), - GFP_KERNEL); - if (!call->preds) - return -ENOMEM; - } + call->filter_active = 0; + filter->n_preds = 0; + + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + if (!filter->preds) + goto oom; for (i = 0; i < MAX_FILTER_PRED; i++) { - if (!call->preds[i]) { - call->preds[i] = pred; - return 0; + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + goto oom; + pred->fn = filter_pred_none; + filter->preds[i] = pred; + } + + return 0; + +oom: + destroy_preds(call); + + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(init_preds); + +static void filter_free_subsystem_preds(struct event_subsystem *system) +{ + struct event_filter *filter = system->filter; + struct ftrace_event_call *call; + int i; + + if (filter->n_preds) { + for (i = 0; i < filter->n_preds; i++) + filter_free_pred(filter->preds[i]); + kfree(filter->preds); + filter->preds = NULL; + filter->n_preds = 0; + } + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!call->define_fields) + continue; + + if (!strcmp(call->system, system->name)) { + filter_disable_preds(call); + remove_filter_string(call->filter); } } + mutex_unlock(&event_mutex); +} + +static int filter_add_pred_fn(struct filter_parse_state *ps, + struct ftrace_event_call *call, + struct filter_pred *pred, + filter_pred_fn_t fn) +{ + struct event_filter *filter = call->filter; + int idx, err; + + if (filter->n_preds == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + + idx = filter->n_preds; + filter_clear_pred(filter->preds[idx]); + err = filter_set_pred(filter->preds[idx], pred, fn); + if (err) + return err; - return -ENOSPC; + filter->n_preds++; + call->filter_active = 1; + + return 0; } +enum { + FILTER_STATIC_STRING = 1, + FILTER_DYN_STRING +}; + static int is_string_field(const char *type) { + if (strstr(type, "__data_loc") && strstr(type, "char")) + return FILTER_DYN_STRING; + if (strchr(type, '[') && strstr(type, "char")) - return 1; + return FILTER_STATIC_STRING; return 0; } -int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) +static int is_legal_op(struct ftrace_event_field *field, int op) { - struct ftrace_event_field *field; - - field = find_event_field(call, pred->field_name); - if (!field) - return -EINVAL; + if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) + return 0; - pred->offset = field->offset; + return 1; +} - if (is_string_field(field->type)) { - if (!pred->str_val) - return -EINVAL; - pred->fn = filter_pred_string; - pred->str_len = field->size; - return __filter_add_pred(call, pred); - } else { - if (pred->str_val) - return -EINVAL; - } +static filter_pred_fn_t select_comparison_fn(int op, int field_size, + int field_is_signed) +{ + filter_pred_fn_t fn = NULL; - switch (field->size) { + switch (field_size) { case 8: - pred->fn = filter_pred_64; + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_64; + else if (field_is_signed) + fn = filter_pred_s64; + else + fn = filter_pred_u64; break; case 4: - pred->fn = filter_pred_32; + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_32; + else if (field_is_signed) + fn = filter_pred_s32; + else + fn = filter_pred_u32; break; case 2: - pred->fn = filter_pred_16; + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_16; + else if (field_is_signed) + fn = filter_pred_s16; + else + fn = filter_pred_u16; break; case 1: - pred->fn = filter_pred_8; + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_8; + else if (field_is_signed) + fn = filter_pred_s8; + else + fn = filter_pred_u8; break; - default: - return -EINVAL; } - return __filter_add_pred(call, pred); + return fn; } -static struct filter_pred *copy_pred(struct filter_pred *pred) +static int filter_add_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, + struct filter_pred *pred) { - struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); - if (!new_pred) - return NULL; + struct ftrace_event_field *field; + filter_pred_fn_t fn; + unsigned long long val; + int string_type; + + pred->fn = filter_pred_none; + + if (pred->op == OP_AND) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_and); + } else if (pred->op == OP_OR) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_or); + } + + field = find_event_field(call, pred->field_name); + if (!field) { + parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); + return -EINVAL; + } - memcpy(new_pred, pred, sizeof(*pred)); + pred->offset = field->offset; - if (pred->field_name) { - new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); - if (!new_pred->field_name) { - kfree(new_pred); - return NULL; - } + if (!is_legal_op(field, pred->op)) { + parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); + return -EINVAL; } - if (pred->str_val) { - new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); - if (!new_pred->str_val) { - filter_free_pred(new_pred); - return NULL; + string_type = is_string_field(field->type); + if (string_type) { + if (string_type == FILTER_STATIC_STRING) + fn = filter_pred_string; + else + fn = filter_pred_strloc; + pred->str_len = field->size; + if (pred->op == OP_NE) + pred->not = 1; + return filter_add_pred_fn(ps, call, pred, fn); + } else { + if (strict_strtoull(pred->str_val, 0, &val)) { + parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); + return -EINVAL; } + pred->val = val; + } + + fn = select_comparison_fn(pred->op, field->size, field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); + return -EINVAL; } - return new_pred; + if (pred->op == OP_NE) + pred->not = 1; + + return filter_add_pred_fn(ps, call, pred, fn); } -int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred) +static int filter_add_subsystem_pred(struct filter_parse_state *ps, + struct event_subsystem *system, + struct filter_pred *pred, + char *filter_string) { - struct ftrace_event_call *call = __start_ftrace_events; - struct filter_pred *event_pred; - int i; - - if (system->preds && !pred->compound) - filter_free_subsystem_preds(system); + struct event_filter *filter = system->filter; + struct ftrace_event_call *call; + int err = 0; - if (!system->preds) { - system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + if (!filter->preds) { + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!system->preds) + + if (!filter->preds) return -ENOMEM; } - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (!system->preds[i]) { - system->preds[i] = pred; - break; - } + if (filter->n_preds == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; } - if (i == MAX_FILTER_PRED) - return -ENOSPC; + filter->preds[filter->n_preds] = pred; + filter->n_preds++; - events_for_each(call) { - int err; + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->regfunc) + if (!call->define_fields) continue; if (strcmp(call->system, system->name)) continue; - if (!find_event_field(call, pred->field_name)) - continue; + err = filter_add_pred(ps, call, pred); + if (err) { + mutex_unlock(&event_mutex); + filter_free_subsystem_preds(system); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + goto out; + } + replace_filter_string(call->filter, filter_string); + } + mutex_unlock(&event_mutex); +out: + return err; +} - event_pred = copy_pred(pred); - if (!event_pred) - goto oom; +static void parse_init(struct filter_parse_state *ps, + struct filter_op *ops, + char *infix_string) +{ + memset(ps, '\0', sizeof(*ps)); - err = filter_add_pred(call, event_pred); - if (err) - filter_free_pred(event_pred); - if (err == -ENOMEM) - goto oom; + ps->infix.string = infix_string; + ps->infix.cnt = strlen(infix_string); + ps->ops = ops; + + INIT_LIST_HEAD(&ps->opstack); + INIT_LIST_HEAD(&ps->postfix); +} + +static char infix_next(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + + return ps->infix.string[ps->infix.tail++]; +} + +static char infix_peek(struct filter_parse_state *ps) +{ + if (ps->infix.tail == strlen(ps->infix.string)) + return 0; + + return ps->infix.string[ps->infix.tail]; +} + +static void infix_advance(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + ps->infix.tail++; +} + +static inline int is_precedence_lower(struct filter_parse_state *ps, + int a, int b) +{ + return ps->ops[a].precedence < ps->ops[b].precedence; +} + +static inline int is_op_char(struct filter_parse_state *ps, char c) +{ + int i; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (ps->ops[i].string[0] == c) + return 1; } return 0; +} -oom: - system->preds[i] = NULL; - return -ENOMEM; +static int infix_get_op(struct filter_parse_state *ps, char firstc) +{ + char nextc = infix_peek(ps); + char opstr[3]; + int i; + + opstr[0] = firstc; + opstr[1] = nextc; + opstr[2] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) { + infix_advance(ps); + return ps->ops[i].id; + } + } + + opstr[1] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) + return ps->ops[i].id; + } + + return OP_NONE; } -int filter_parse(char **pbuf, struct filter_pred *pred) +static inline void clear_operand_string(struct filter_parse_state *ps) { - char *tmp, *tok, *val_str = NULL; - int tok_n = 0; + memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); + ps->operand.tail = 0; +} - /* field ==/!= number, or/and field ==/!= number, number */ - while ((tok = strsep(pbuf, " \n"))) { - if (tok_n == 0) { - if (!strcmp(tok, "0")) { - pred->clear = 1; - return 0; - } else if (!strcmp(tok, "&&")) { - pred->or = 0; - pred->compound = 1; - } else if (!strcmp(tok, "||")) { - pred->or = 1; - pred->compound = 1; - } else - pred->field_name = tok; - tok_n = 1; +static inline int append_operand_char(struct filter_parse_state *ps, char c) +{ + if (ps->operand.tail == MAX_FILTER_STR_VAL - 1) + return -EINVAL; + + ps->operand.string[ps->operand.tail++] = c; + + return 0; +} + +static int filter_opstack_push(struct filter_parse_state *ps, int op) +{ + struct opstack_op *opstack_op; + + opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); + if (!opstack_op) + return -ENOMEM; + + opstack_op->op = op; + list_add(&opstack_op->list, &ps->opstack); + + return 0; +} + +static int filter_opstack_empty(struct filter_parse_state *ps) +{ + return list_empty(&ps->opstack); +} + +static int filter_opstack_top(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + + return opstack_op->op; +} + +static int filter_opstack_pop(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + int op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + op = opstack_op->op; + list_del(&opstack_op->list); + + kfree(opstack_op); + + return op; +} + +static void filter_opstack_clear(struct filter_parse_state *ps) +{ + while (!filter_opstack_empty(ps)) + filter_opstack_pop(ps); +} + +static char *curr_operand(struct filter_parse_state *ps) +{ + return ps->operand.string; +} + +static int postfix_append_operand(struct filter_parse_state *ps, char *operand) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = OP_NONE; + elt->operand = kstrdup(operand, GFP_KERNEL); + if (!elt->operand) { + kfree(elt); + return -ENOMEM; + } + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static int postfix_append_op(struct filter_parse_state *ps, int op) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = op; + elt->operand = NULL; + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static void postfix_clear(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + + while (!list_empty(&ps->postfix)) { + elt = list_first_entry(&ps->postfix, struct postfix_elt, list); + kfree(elt->operand); + list_del(&elt->list); + } +} + +static int filter_parse(struct filter_parse_state *ps) +{ + int in_string = 0; + int op, top_op; + char ch; + + while ((ch = infix_next(ps))) { + if (ch == '"') { + in_string ^= 1; continue; } - if (tok_n == 1) { - if (!pred->field_name) - pred->field_name = tok; - else if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; - else { - pred->field_name = NULL; + + if (in_string) + goto parse_operand; + + if (isspace(ch)) + continue; + + if (is_op_char(ps, ch)) { + op = infix_get_op(ps, ch); + if (op == OP_NONE) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); return -EINVAL; } - tok_n = 2; + + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_top(ps); + if (!is_precedence_lower(ps, top_op, op)) { + top_op = filter_opstack_pop(ps); + postfix_append_op(ps, top_op); + continue; + } + break; + } + + filter_opstack_push(ps, op); continue; } - if (tok_n == 2) { - if (pred->compound) { - if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; - else { - pred->field_name = NULL; - return -EINVAL; - } - } else { - val_str = tok; - break; /* done */ + + if (ch == '(') { + filter_opstack_push(ps, OP_OPEN_PAREN); + continue; + } + + if (ch == ')') { + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + top_op = filter_opstack_pop(ps); + while (top_op != OP_NONE) { + if (top_op == OP_OPEN_PAREN) + break; + postfix_append_op(ps, top_op); + top_op = filter_opstack_pop(ps); + } + if (top_op == OP_NONE) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; } - tok_n = 3; continue; } - if (tok_n == 3) { - val_str = tok; - break; /* done */ +parse_operand: + if (append_operand_char(ps, ch)) { + parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); + return -EINVAL; } } - if (!val_str) { - pred->field_name = NULL; - return -EINVAL; + if (strlen(curr_operand(ps))) + postfix_append_operand(ps, curr_operand(ps)); + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_pop(ps); + if (top_op == OP_NONE) + break; + if (top_op == OP_OPEN_PAREN) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + postfix_append_op(ps, top_op); } - pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); - if (!pred->field_name) - return -ENOMEM; + return 0; +} - pred->val = simple_strtoull(val_str, &tmp, 0); - if (tmp == val_str) { - pred->str_val = kstrdup(val_str, GFP_KERNEL); - if (!pred->str_val) - return -ENOMEM; - } else if (*tmp != '\0') +static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->field_name = kstrdup(operand1, GFP_KERNEL); + if (!pred->field_name) { + kfree(pred); + return NULL; + } + + strcpy(pred->str_val, operand2); + pred->str_len = strlen(operand2); + + pred->op = op; + + return pred; +} + +static struct filter_pred *create_logical_pred(int op) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->op = op; + + return pred; +} + +static int check_preds(struct filter_parse_state *ps) +{ + int n_normal_preds = 0, n_logical_preds = 0; + struct postfix_elt *elt; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + + if (elt->op == OP_AND || elt->op == OP_OR) { + n_logical_preds++; + continue; + } + n_normal_preds++; + } + + if (!n_normal_preds || n_logical_preds >= n_normal_preds) { + parse_error(ps, FILT_ERR_INVALID_FILTER, 0); return -EINVAL; + } return 0; } +static int replace_preds(struct event_subsystem *system, + struct ftrace_event_call *call, + struct filter_parse_state *ps, + char *filter_string) +{ + char *operand1 = NULL, *operand2 = NULL; + struct filter_pred *pred; + struct postfix_elt *elt; + int err; + + err = check_preds(ps); + if (err) + return err; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) { + if (!operand1) + operand1 = elt->operand; + else if (!operand2) + operand2 = elt->operand; + else { + parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); + return -EINVAL; + } + continue; + } + + if (elt->op == OP_AND || elt->op == OP_OR) { + pred = create_logical_pred(elt->op); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, + pred, filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; + continue; + } + + if (!operand1 || !operand2) { + parse_error(ps, FILT_ERR_MISSING_FIELD, 0); + return -EINVAL; + } + + pred = create_pred(elt->op, operand1, operand2); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, pred, + filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; + } + + return 0; +} + +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_disable_preds(call); + remove_filter_string(call->filter); + mutex_unlock(&filter_mutex); + return 0; + } + + err = -ENOMEM; + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto out_unlock; + + filter_disable_preds(call); + replace_filter_string(call->filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, call->filter); + goto out; + } + + err = replace_preds(NULL, call, ps, filter_string); + if (err) + append_filter_err(ps, call->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); +out_unlock: + mutex_unlock(&filter_mutex); + + return err; +} + +int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_free_subsystem_preds(system); + remove_filter_string(system->filter); + mutex_unlock(&filter_mutex); + return 0; + } + + err = -ENOMEM; + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto out_unlock; + + filter_free_subsystem_preds(system); + replace_filter_string(system->filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, system->filter); + goto out; + } + + err = replace_preds(system, NULL, ps, filter_string); + if (err) + append_filter_err(ps, system->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); +out_unlock: + mutex_unlock(&filter_mutex); + + return err; +} diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h deleted file mode 100644 index 38985f9..0000000 --- a/kernel/trace/trace_events_stage_1.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Stage 1 of the trace events. - * - * Override the macros in <trace/trace_event_types.h> to include the following: - * - * struct ftrace_raw_<call> { - * struct trace_entry ent; - * <type> <item>; - * <type2> <item2>[<len>]; - * [...] - * }; - * - * The <type> <item> is created by the __field(type, item) macro or - * the __array(type2, item2, len) macro. - * We simply do "type item;", and that will create the fields - * in the structure. - */ - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - -#undef __array -#define __array(type, item, len) type item[len]; - -#undef __field -#define __field(type, item) type item; - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef TRACE_EVENT -#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ - struct ftrace_raw_##name { \ - struct trace_entry ent; \ - tstruct \ - }; \ - static struct ftrace_event_call event_##name - -#include <trace/trace_event_types.h> diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h deleted file mode 100644 index d363c66..0000000 --- a/kernel/trace/trace_events_stage_2.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Stage 2 of the trace events. - * - * Override the macros in <trace/trace_event_types.h> to include the following: - * - * enum print_line_t - * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags) - * { - * struct trace_seq *s = &iter->seq; - * struct ftrace_raw_<call> *field; <-- defined in stage 1 - * struct trace_entry *entry; - * int ret; - * - * entry = iter->ent; - * - * if (entry->type != event_<call>.id) { - * WARN_ON_ONCE(1); - * return TRACE_TYPE_UNHANDLED; - * } - * - * field = (typeof(field))entry; - * - * ret = trace_seq_printf(s, <TP_printk> "\n"); - * if (!ret) - * return TRACE_TYPE_PARTIAL_LINE; - * - * return TRACE_TYPE_HANDLED; - * } - * - * This is the method used to print the raw event to the trace - * output format. Note, this is not needed if the data is read - * in binary. - */ - -#undef __entry -#define __entry field - -#undef TP_printk -#define TP_printk(fmt, args...) fmt "\n", args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -enum print_line_t \ -ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ -{ \ - struct trace_seq *s = &iter->seq; \ - struct ftrace_raw_##call *field; \ - struct trace_entry *entry; \ - int ret; \ - \ - entry = iter->ent; \ - \ - if (entry->type != event_##call.id) { \ - WARN_ON_ONCE(1); \ - return TRACE_TYPE_UNHANDLED; \ - } \ - \ - field = (typeof(field))entry; \ - \ - ret = trace_seq_printf(s, #call ": " print); \ - if (!ret) \ - return TRACE_TYPE_PARTIAL_LINE; \ - \ - return TRACE_TYPE_HANDLED; \ -} - -#include <trace/trace_event_types.h> - -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " offset:%u; size:%u;\n", - * offsetof(struct ftrace_raw_##call, item), - * sizeof(field.type)); - * - * } - */ - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __entry -#define __entry REC - -#undef TP_printk -#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include <trace/trace_event_types.h> - -#undef __field -#define __field(type, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (ret) \ - return ret; - -#undef __array -#define __array(type, item, len) \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (ret) \ - return ret; - -#define __common_field(type, item) \ - ret = trace_define_field(event_call, #type, "common_" #item, \ - offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ - if (ret) \ - return ret; - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -int \ -ftrace_define_fields_##call(void) \ -{ \ - struct ftrace_raw_##call field; \ - struct ftrace_event_call *event_call = &event_##call; \ - int ret; \ - \ - __common_field(unsigned char, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ - \ - tstruct; \ - \ - return ret; \ -} - -#include <trace/trace_event_types.h> diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h deleted file mode 100644 index 9d2fa78..0000000 --- a/kernel/trace/trace_events_stage_3.h +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Stage 3 of the trace events. - * - * Override the macros in <trace/trace_event_types.h> to include the following: - * - * static void ftrace_event_<call>(proto) - * { - * event_trace_printk(_RET_IP_, "<call>: " <fmt>); - * } - * - * static int ftrace_reg_event_<call>(void) - * { - * int ret; - * - * ret = register_trace_<call>(ftrace_event_<call>); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to <call>"); - * return ret; - * } - * - * static void ftrace_unreg_event_<call>(void) - * { - * unregister_trace_<call>(ftrace_event_<call>); - * } - * - * For those macros defined with TRACE_FORMAT: - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_<call> = { - * .name = "<call>", - * .regfunc = ftrace_reg_event_<call>, - * .unregfunc = ftrace_unreg_event_<call>, - * } - * - * - * For those macros defined with TRACE_EVENT: - * - * static struct ftrace_event_call event_<call>; - * - * static void ftrace_raw_event_<call>(proto) - * { - * struct ring_buffer_event *event; - * struct ftrace_raw_<call> *entry; <-- defined in stage 1 - * unsigned long irq_flags; - * int pc; - * - * local_save_flags(irq_flags); - * pc = preempt_count(); - * - * event = trace_current_buffer_lock_reserve(event_<call>.id, - * sizeof(struct ftrace_raw_<call>), - * irq_flags, pc); - * if (!event) - * return; - * entry = ring_buffer_event_data(event); - * - * <assign>; <-- Here we assign the entries by the __field and - * __array macros. - * - * trace_current_buffer_unlock_commit(event, irq_flags, pc); - * } - * - * static int ftrace_raw_reg_event_<call>(void) - * { - * int ret; - * - * ret = register_trace_<call>(ftrace_raw_event_<call>); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to <call>"); - * return ret; - * } - * - * static void ftrace_unreg_event_<call>(void) - * { - * unregister_trace_<call>(ftrace_raw_event_<call>); - * } - * - * static struct trace_event ftrace_event_type_<call> = { - * .trace = ftrace_raw_output_<call>, <-- stage 2 - * }; - * - * static int ftrace_raw_init_event_<call>(void) - * { - * int id; - * - * id = register_ftrace_event(&ftrace_event_type_<call>); - * if (!id) - * return -ENODEV; - * event_<call>.id = id; - * return 0; - * } - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_<call> = { - * .name = "<call>", - * .system = "<system>", - * .raw_init = ftrace_raw_init_event_<call>, - * .regfunc = ftrace_reg_event_<call>, - * .unregfunc = ftrace_unreg_event_<call>, - * .show_format = ftrace_format_<call>, - * } - * - */ - -#undef TP_FMT -#define TP_FMT(fmt, args...) fmt "\n", ##args - -#ifdef CONFIG_EVENT_PROFILE -#define _TRACE_PROFILE(call, proto, args) \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int); \ - perf_tpcounter_event(event_##call.id); \ -} \ - \ -static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \ -{ \ - int ret = 0; \ - \ - if (!atomic_inc_return(&call->profile_count)) \ - ret = register_trace_##call(ftrace_profile_##call); \ - \ - return ret; \ -} \ - \ -static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ -{ \ - if (atomic_add_negative(-1, &call->profile_count)) \ - unregister_trace_##call(ftrace_profile_##call); \ -} - -#define _TRACE_PROFILE_INIT(call) \ - .profile_count = ATOMIC_INIT(-1), \ - .profile_enable = ftrace_profile_enable_##call, \ - .profile_disable = ftrace_profile_disable_##call, - -#else -#define _TRACE_PROFILE(call, proto, args) -#define _TRACE_PROFILE_INIT(call) -#endif - -#define _TRACE_FORMAT(call, proto, args, fmt) \ -static void ftrace_event_##call(proto) \ -{ \ - event_trace_printk(_RET_IP_, #call ": " fmt); \ -} \ - \ -static int ftrace_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_event_##call); \ - if (ret) \ - pr_info("event trace: Could not activate trace point " \ - "probe to " #call "\n"); \ - return ret; \ -} \ - \ -static void ftrace_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_event_##call); \ -} \ - \ -static struct ftrace_event_call event_##call; \ - \ -static int ftrace_init_event_##call(void) \ -{ \ - int id; \ - \ - id = register_ftrace_event(NULL); \ - if (!id) \ - return -ENODEV; \ - event_##call.id = id; \ - return 0; \ -} - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) \ -_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_init_event_##call, \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ - _TRACE_PROFILE_INIT(call) \ -} - -#undef __entry -#define __entry entry - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ - \ -static struct ftrace_event_call event_##call; \ - \ -static void ftrace_raw_event_##call(proto) \ -{ \ - struct ftrace_event_call *call = &event_##call; \ - struct ring_buffer_event *event; \ - struct ftrace_raw_##call *entry; \ - unsigned long irq_flags; \ - int pc; \ - \ - local_save_flags(irq_flags); \ - pc = preempt_count(); \ - \ - event = trace_current_buffer_lock_reserve(event_##call.id, \ - sizeof(struct ftrace_raw_##call), \ - irq_flags, pc); \ - if (!event) \ - return; \ - entry = ring_buffer_event_data(event); \ - \ - assign; \ - \ - if (call->preds && !filter_match_preds(call, entry)) \ - ring_buffer_event_discard(event); \ - \ - trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ - \ -} \ - \ -static int ftrace_raw_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_raw_event_##call); \ - if (ret) \ - pr_info("event trace: Could not activate trace point " \ - "probe to " #call "\n"); \ - return ret; \ -} \ - \ -static void ftrace_raw_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_raw_event_##call); \ -} \ - \ -static struct trace_event ftrace_event_type_##call = { \ - .trace = ftrace_raw_output_##call, \ -}; \ - \ -static int ftrace_raw_init_event_##call(void) \ -{ \ - int id; \ - \ - id = register_ftrace_event(&ftrace_event_type_##call); \ - if (!id) \ - return -ENODEV; \ - event_##call.id = id; \ - INIT_LIST_HEAD(&event_##call.fields); \ - return 0; \ -} \ - \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event_##call, \ - .regfunc = ftrace_raw_reg_event_##call, \ - .unregfunc = ftrace_raw_unreg_event_##call, \ - .show_format = ftrace_format_##call, \ - .define_fields = ftrace_define_fields_##call, \ - _TRACE_PROFILE_INIT(call) \ -} - -#include <trace/trace_event_types.h> - -#undef _TRACE_PROFILE -#undef _TRACE_PROFILE_INIT - diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 07a22c3..d06cf89 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -19,8 +19,12 @@ #undef TRACE_STRUCT #define TRACE_STRUCT(args...) args +extern void __bad_type_size(void); + #undef TRACE_FIELD #define TRACE_FIELD(type, item, assign) \ + if (sizeof(type) != sizeof(field.item)) \ + __bad_type_size(); \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), item), \ @@ -30,7 +34,7 @@ #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), item), \ @@ -46,6 +50,9 @@ if (!ret) \ return 0; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) #undef TP_RAW_FMT #define TP_RAW_FMT(args...) args @@ -65,6 +72,22 @@ ftrace_format_##call(struct trace_seq *s) \ return ret; \ } +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct args field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + \ + return ret; \ +} + #include "trace_event_types.h" #undef TRACE_ZERO_CHAR @@ -78,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) + #undef TP_CMD #define TP_CMD(cmd...) cmd @@ -85,18 +112,95 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_ENTRY entry #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ cmd; #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int ftrace_define_fields_##call(void); \ +static int ftrace_raw_init_event_##call(void); \ + \ +struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .id = proto, \ + .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_raw_init_event_##call, \ + .show_format = ftrace_format_##call, \ + .define_fields = ftrace_define_fields_##call, \ +}; \ +static int ftrace_raw_init_event_##call(void) \ +{ \ + INIT_LIST_HEAD(&event_##call.fields); \ + init_preds(&event_##call); \ + return 0; \ +} \ + +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) \ \ -static struct ftrace_event_call __used \ +struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .id = proto, \ .system = __stringify(TRACE_SYSTEM), \ .show_format = ftrace_format_##call, \ +}; + +#include "trace_event_types.h" + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed_type(type)); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), 0); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_ZERO_CHAR +#define TRACE_FIELD_ZERO_CHAR(item) + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int \ +ftrace_define_fields_##call(void) \ +{ \ + struct ftrace_event_call *event_call = &event_##call; \ + struct args field; \ + int ret; \ + \ + __common_field(unsigned char, type, 0); \ + __common_field(unsigned char, flags, 0); \ + __common_field(unsigned char, preempt_count, 0); \ + __common_field(int, pid, 1); \ + __common_field(int, tgid, 1); \ + \ + tstruct; \ + \ + return ret; \ } + +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) + #include "trace_event_types.h" diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d28687e..8b59241 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -65,6 +65,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) if (!current->ret_stack) return -EBUSY; + /* + * We must make sure the ret_stack is tested before we read + * anything else. + */ + smp_rmb(); + /* The return trace stack is full */ if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { atomic_inc(¤t->trace_overrun); @@ -78,13 +84,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; + current->ret_stack[index].subtime = 0; *depth = index; return 0; } /* Retrieve a function return address to the trace stack on thread info.*/ -void +static void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) { int index; @@ -104,9 +111,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); trace->depth = index; - barrier(); - current->curr_ret_stack--; - } /* @@ -121,6 +125,8 @@ unsigned long ftrace_return_to_handler(void) ftrace_pop_return_trace(&trace, &ret); trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); + barrier(); + current->curr_ret_stack--; if (unlikely(!ret)) { ftrace_graph_stop(); @@ -426,8 +432,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_HANDLED; } -static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s) +enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); /* log10(ULONG_MAX) + '\0' */ @@ -464,12 +470,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) if (!ret) return TRACE_TYPE_PARTIAL_LINE; } + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + int ret; + + ret = trace_print_graph_duration(duration, s); + if (ret != TRACE_TYPE_HANDLED) + return ret; ret = trace_seq_printf(s, "| "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + return TRACE_TYPE_HANDLED; } /* Case of a leaf function on its call entry */ diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 7bfdf4c..ca7d7c4 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -1,10 +1,9 @@ /* - * h/w branch tracer for x86 based on bts + * h/w branch tracer for x86 based on BTS * * Copyright (C) 2008-2009 Intel Corporation. * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009 */ -#include <linux/spinlock.h> #include <linux/kallsyms.h> #include <linux/debugfs.h> #include <linux/ftrace.h> @@ -15,110 +14,119 @@ #include <asm/ds.h> -#include "trace.h" #include "trace_output.h" +#include "trace.h" -#define SIZEOF_BTS (1 << 13) +#define BTS_BUFFER_SIZE (1 << 13) -/* - * The tracer lock protects the below per-cpu tracer array. - * It needs to be held to: - * - start tracing on all cpus - * - stop tracing on all cpus - * - start tracing on a single hotplug cpu - * - stop tracing on a single hotplug cpu - * - read the trace from all cpus - * - read the trace from a single cpu - */ -static DEFINE_SPINLOCK(bts_tracer_lock); static DEFINE_PER_CPU(struct bts_tracer *, tracer); -static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); +static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); #define this_tracer per_cpu(tracer, smp_processor_id()) -#define this_buffer per_cpu(buffer, smp_processor_id()) -static int __read_mostly trace_hw_branches_enabled; +static int trace_hw_branches_enabled __read_mostly; +static int trace_hw_branches_suspended __read_mostly; static struct trace_array *hw_branch_trace __read_mostly; -/* - * Start tracing on the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_start_cpu(void *arg) +static void bts_trace_init_cpu(int cpu) { - if (this_tracer) - ds_release_bts(this_tracer); - - this_tracer = - ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, - /* ovfl = */ NULL, /* th = */ (size_t)-1, - BTS_KERNEL); - if (IS_ERR(this_tracer)) { - this_tracer = NULL; - return; - } + per_cpu(tracer, cpu) = + ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + + if (IS_ERR(per_cpu(tracer, cpu))) + per_cpu(tracer, cpu) = NULL; } -static void bts_trace_start(struct trace_array *tr) +static int bts_trace_init(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; + + hw_branch_trace = tr; + trace_hw_branches_enabled = 0; - on_each_cpu(bts_trace_start_cpu, NULL, 1); - trace_hw_branches_enabled = 1; + get_online_cpus(); + for_each_online_cpu(cpu) { + bts_trace_init_cpu(cpu); - spin_unlock(&bts_tracer_lock); + if (likely(per_cpu(tracer, cpu))) + trace_hw_branches_enabled = 1; + } + trace_hw_branches_suspended = 0; + put_online_cpus(); + + /* If we could not enable tracing on a single cpu, we fail. */ + return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP; } -/* - * Stop tracing on the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_stop_cpu(void *arg) +static void bts_trace_reset(struct trace_array *tr) { - if (this_tracer) { - ds_release_bts(this_tracer); - this_tracer = NULL; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + if (likely(per_cpu(tracer, cpu))) { + ds_release_bts(per_cpu(tracer, cpu)); + per_cpu(tracer, cpu) = NULL; + } } + trace_hw_branches_enabled = 0; + trace_hw_branches_suspended = 0; + put_online_cpus(); } -static void bts_trace_stop(struct trace_array *tr) +static void bts_trace_start(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; - trace_hw_branches_enabled = 0; - on_each_cpu(bts_trace_stop_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_resume_bts(per_cpu(tracer, cpu)); + trace_hw_branches_suspended = 0; + put_online_cpus(); +} - spin_unlock(&bts_tracer_lock); +static void bts_trace_stop(struct trace_array *tr) +{ + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); + trace_hw_branches_suspended = 1; + put_online_cpus(); } static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; - - spin_lock(&bts_tracer_lock); - - if (!trace_hw_branches_enabled) - goto out; + int cpu = (long)hcpu; switch (action) { case CPU_ONLINE: case CPU_DOWN_FAILED: - smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); + /* The notification is sent with interrupts enabled. */ + if (trace_hw_branches_enabled) { + bts_trace_init_cpu(cpu); + + if (trace_hw_branches_suspended && + likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); + } break; + case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); - break; + /* The notification is sent with interrupts enabled. */ + if (likely(per_cpu(tracer, cpu))) { + ds_release_bts(per_cpu(tracer, cpu)); + per_cpu(tracer, cpu) = NULL; + } } - out: - spin_unlock(&bts_tracer_lock); return NOTIFY_DONE; } @@ -126,20 +134,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { .notifier_call = bts_hotcpu_handler }; -static int bts_trace_init(struct trace_array *tr) -{ - hw_branch_trace = tr; - - bts_trace_start(tr); - - return 0; -} - -static void bts_trace_reset(struct trace_array *tr) -{ - bts_trace_stop(tr); -} - static void bts_trace_print_header(struct seq_file *m) { seq_puts(m, "# CPU# TO <- FROM\n"); @@ -147,10 +141,10 @@ static void bts_trace_print_header(struct seq_file *m) static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) { + unsigned long symflags = TRACE_ITER_SYM_OFFSET; struct trace_entry *entry = iter->ent; struct trace_seq *seq = &iter->seq; struct hw_branch_entry *it; - unsigned long symflags = TRACE_ITER_SYM_OFFSET; trace_assign_type(it, entry); @@ -168,6 +162,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) void trace_hw_branch(u64 from, u64 to) { + struct ftrace_event_call *call = &event_hw_branch; struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; @@ -194,7 +189,8 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: atomic_dec(&tr->data[cpu]->disabled); @@ -224,11 +220,11 @@ static void trace_bts_at(const struct bts_trace *trace, void *at) /* * Collect the trace on the current cpu and write it into the ftrace buffer. * - * pre: bts_tracer_lock must be locked + * pre: tracing must be suspended on the current cpu */ static void trace_bts_cpu(void *arg) { - struct trace_array *tr = (struct trace_array *) arg; + struct trace_array *tr = (struct trace_array *)arg; const struct bts_trace *trace; unsigned char *at; @@ -241,10 +237,9 @@ static void trace_bts_cpu(void *arg) if (unlikely(!this_tracer)) return; - ds_suspend_bts(this_tracer); trace = ds_read_bts(this_tracer); if (!trace) - goto out; + return; for (at = trace->ds.top; (void *)at < trace->ds.end; at += trace->ds.size) @@ -253,18 +248,27 @@ static void trace_bts_cpu(void *arg) for (at = trace->ds.begin; (void *)at < trace->ds.top; at += trace->ds.size) trace_bts_at(trace, at); - -out: - ds_resume_bts(this_tracer); } static void trace_bts_prepare(struct trace_iterator *iter) { - spin_lock(&bts_tracer_lock); + int cpu; + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); + /* + * We need to collect the trace on the respective cpu since ftrace + * implicitly adds the record for the current cpu. + * Once that is more flexible, we could collect the data from any cpu. + */ on_each_cpu(trace_bts_cpu, iter->tr, 1); - spin_unlock(&bts_tracer_lock); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_resume_bts(per_cpu(tracer, cpu)); + put_online_cpus(); } static void trace_bts_close(struct trace_iterator *iter) @@ -274,11 +278,11 @@ static void trace_bts_close(struct trace_iterator *iter) void trace_hw_branch_oops(void) { - spin_lock(&bts_tracer_lock); - - trace_bts_cpu(hw_branch_trace); - - spin_unlock(&bts_tracer_lock); + if (this_tracer) { + ds_suspend_bts_noirq(this_tracer); + trace_bts_cpu(hw_branch_trace); + ds_resume_bts_noirq(this_tracer); + } } struct tracer bts_tracer __read_mostly = @@ -291,7 +295,10 @@ struct tracer bts_tracer __read_mostly = .start = bts_trace_start, .stop = bts_trace_stop, .open = trace_bts_prepare, - .close = trace_bts_close + .close = trace_bts_close, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_hw_branches, +#endif /* CONFIG_FTRACE_SELFTEST */ }; __init static int init_bts_trace(void) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 8e37fcd..d53b45e 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,8 @@ #include <linux/kernel.h> #include <linux/mmiotrace.h> #include <linux/pci.h> +#include <linux/time.h> + #include <asm/atomic.h> #include "trace.h" @@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) struct mmiotrace_rw *rw; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret = 1; @@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) struct mmiotrace_map *m; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 64b54a5..7938f3a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,11 +14,25 @@ /* must be a power of 2 */ #define EVENT_HASHSIZE 128 -static DEFINE_MUTEX(trace_event_mutex); +DECLARE_RWSEM(trace_event_mutex); + +DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); +EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); + static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; +void trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + + s->buffer[len] = 0; + seq_puts(m, s->buffer); + + trace_seq_init(s); +} + enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -84,6 +98,39 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } +EXPORT_SYMBOL_GPL(trace_seq_printf); + +/** + * trace_seq_vprintf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +int +trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (!len) + return 0; + + ret = vsnprintf(s->buffer + s->len, len, fmt, args); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} +EXPORT_SYMBOL_GPL(trace_seq_vprintf); int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { @@ -201,6 +248,67 @@ int trace_seq_path(struct trace_seq *s, struct path *path) return 0; } +const char * +ftrace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array) +{ + unsigned long mask; + const char *str; + const char *ret = p->buffer + p->len; + int i; + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (p->len && delim) + trace_seq_puts(p, delim); + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (p->len && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%lx", flags); + } + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_flags_seq); + +const char * +ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array) +{ + int i; + const char *ret = p->buffer + p->len; + + for (i = 0; symbol_array[i].name; i++) { + + if (val != symbol_array[i].mask) + continue; + + trace_seq_puts(p, symbol_array[i].name); + break; + } + + if (!p->len) + trace_seq_printf(p, "0x%lx", val); + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_symbols_seq); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { @@ -311,17 +419,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, if (ip == ULONG_MAX || !ret) break; - if (i && ret) - ret = trace_seq_puts(s, " <- "); + if (ret) + ret = trace_seq_puts(s, " => "); if (!ip) { if (ret) ret = trace_seq_puts(s, "??"); + if (ret) + ret = trace_seq_puts(s, "\n"); continue; } if (!ret) break; if (ret) ret = seq_print_user_ip(s, mm, ip, sym_flags); + ret = trace_seq_puts(s, "\n"); } if (mm) @@ -455,6 +566,7 @@ static int task_state_char(unsigned long state) * @type: the type of event to look for * * Returns an event of type @type otherwise NULL + * Called with trace_event_read_lock() held. */ struct trace_event *ftrace_find_event(int type) { @@ -464,7 +576,7 @@ struct trace_event *ftrace_find_event(int type) key = type & (EVENT_HASHSIZE - 1); - hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { + hlist_for_each_entry(event, n, &event_hash[key], node) { if (event->type == type) return event; } @@ -472,6 +584,46 @@ struct trace_event *ftrace_find_event(int type) return NULL; } +static LIST_HEAD(ftrace_event_list); + +static int trace_search_list(struct list_head **list) +{ + struct trace_event *e; + int last = __TRACE_LAST_TYPE; + + if (list_empty(&ftrace_event_list)) { + *list = &ftrace_event_list; + return last + 1; + } + + /* + * We used up all possible max events, + * lets see if somebody freed one. + */ + list_for_each_entry(e, &ftrace_event_list, list) { + if (e->type != last + 1) + break; + last++; + } + + /* Did we used up all 65 thousand events??? */ + if ((last + 1) > FTRACE_MAX_EVENT) + return 0; + + *list = &e->list; + return last + 1; +} + +void trace_event_read_lock(void) +{ + down_read(&trace_event_mutex); +} + +void trace_event_read_unlock(void) +{ + up_read(&trace_event_mutex); +} + /** * register_ftrace_event - register output for an event type * @event: the event type to register @@ -492,22 +644,42 @@ int register_ftrace_event(struct trace_event *event) unsigned key; int ret = 0; - mutex_lock(&trace_event_mutex); + down_write(&trace_event_mutex); - if (!event) { - ret = next_event_type++; + if (WARN_ON(!event)) goto out; - } - if (!event->type) - event->type = next_event_type++; - else if (event->type > __TRACE_LAST_TYPE) { + INIT_LIST_HEAD(&event->list); + + if (!event->type) { + struct list_head *list = NULL; + + if (next_event_type > FTRACE_MAX_EVENT) { + + event->type = trace_search_list(&list); + if (!event->type) + goto out; + + } else { + + event->type = next_event_type++; + list = &ftrace_event_list; + } + + if (WARN_ON(ftrace_find_event(event->type))) + goto out; + + list_add_tail(&event->list, list); + + } else if (event->type > __TRACE_LAST_TYPE) { printk(KERN_WARNING "Need to add type to trace.h\n"); WARN_ON(1); - } - - if (ftrace_find_event(event->type)) goto out; + } else { + /* Is this event already used */ + if (ftrace_find_event(event->type)) + goto out; + } if (event->trace == NULL) event->trace = trace_nop_print; @@ -520,14 +692,25 @@ int register_ftrace_event(struct trace_event *event) key = event->type & (EVENT_HASHSIZE - 1); - hlist_add_head_rcu(&event->node, &event_hash[key]); + hlist_add_head(&event->node, &event_hash[key]); ret = event->type; out: - mutex_unlock(&trace_event_mutex); + up_write(&trace_event_mutex); return ret; } +EXPORT_SYMBOL_GPL(register_ftrace_event); + +/* + * Used by module code with the trace_event_mutex held for write. + */ +int __unregister_ftrace_event(struct trace_event *event) +{ + hlist_del(&event->node); + list_del(&event->list); + return 0; +} /** * unregister_ftrace_event - remove a no longer used event @@ -535,12 +718,13 @@ int register_ftrace_event(struct trace_event *event) */ int unregister_ftrace_event(struct trace_event *event) { - mutex_lock(&trace_event_mutex); - hlist_del(&event->node); - mutex_unlock(&trace_event_mutex); + down_write(&trace_event_mutex); + __unregister_ftrace_event(event); + up_write(&trace_event_mutex); return 0; } +EXPORT_SYMBOL_GPL(unregister_ftrace_event); /* * Standard events @@ -833,14 +1017,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); + if (!trace_seq_puts(s, "<stack trace>\n")) + goto partial; for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (i) { - if (!trace_seq_puts(s, " <= ")) - goto partial; + if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) + break; + if (!trace_seq_puts(s, " => ")) + goto partial; - if (!seq_print_ip_sym(s, field->caller[i], flags)) - goto partial; - } + if (!seq_print_ip_sym(s, field->caller[i], flags)) + goto partial; if (!trace_seq_puts(s, "\n")) goto partial; } @@ -868,10 +1054,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!seq_print_userip_objs(field, s, flags)) + if (!trace_seq_puts(s, "<user stack trace>\n")) goto partial; - if (!trace_seq_putc(s, '\n')) + if (!seq_print_userip_objs(field, s, flags)) goto partial; return TRACE_TYPE_HANDLED; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index e0bde39..d38bec4 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -1,41 +1,17 @@ #ifndef __TRACE_EVENTS_H #define __TRACE_EVENTS_H +#include <linux/trace_seq.h> #include "trace.h" -typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, - int flags); - -struct trace_event { - struct hlist_node node; - int type; - trace_print_func trace; - trace_print_func raw; - trace_print_func hex; - trace_print_func binary; -}; - extern enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); -extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, - size_t cnt); -extern int trace_seq_puts(struct trace_seq *s, const char *str); -extern int trace_seq_putc(struct trace_seq *s, unsigned char c); -extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); -extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, - size_t len); -extern void *trace_seq_reserve(struct trace_seq *s, size_t len); -extern int trace_seq_path(struct trace_seq *s, struct path *path); extern int seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, unsigned long sym_flags); extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, @@ -44,13 +20,17 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); +extern void trace_event_read_lock(void); +extern void trace_event_read_unlock(void); extern struct trace_event *ftrace_find_event(int type); -extern int register_ftrace_event(struct trace_event *event); -extern int unregister_ftrace_event(struct trace_event *event); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); +/* used by module unregistering */ +extern int __unregister_ftrace_event(struct trace_event *event); +extern struct rw_semaphore trace_event_mutex; + #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 1184397..8a30d98 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type, static void probe_power_end(struct power_trace *it) { + struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; @@ -54,7 +55,8 @@ static void probe_power_end(struct power_trace *it) goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } @@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it) static void probe_power_mark(struct power_trace *it, unsigned int type, unsigned int level) { + struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; @@ -84,7 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index eb81556..9bece96 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = { static __init int init_trace_printk_function_export(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; - entry = debugfs_create_file("printk_formats", 0444, d_tracer, + trace_create_file("printk_formats", 0444, d_tracer, NULL, &ftrace_formats_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'printk_formats' entry\n"); return 0; } diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 9117cea..a98106d 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -10,7 +10,7 @@ #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> -#include <trace/sched.h> +#include <trace/events/sched.h> #include "trace.h" @@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, int cpu; int pc; - if (!sched_ref || sched_stopped) + if (unlikely(!sched_ref)) return; tracing_record_cmdline(prev); tracing_record_cmdline(next); - if (!tracer_enabled) + if (!tracer_enabled || sched_stopped) return; pc = preempt_count(); @@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) unsigned long flags; int cpu, pc; - if (!likely(tracer_enabled)) + if (unlikely(!sched_ref)) return; - pc = preempt_count(); tracing_record_cmdline(current); - if (sched_stopped) + if (!tracer_enabled || sched_stopped) return; + pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); data = ctx_trace->data[cpu]; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5bc00e8f..eacb272 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,7 +15,7 @@ #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> -#include <trace/sched.h> +#include <trace/events/sched.h> #include "trace.h" @@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, pc = preempt_count(); - /* The task we are waiting for is waking up */ - data = wakeup_trace->data[wakeup_cpu]; - /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); @@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; + /* The task we are waiting for is waking up */ + data = wakeup_trace->data[wakeup_cpu]; + trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 08f4eb2..00dd648 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: + case TRACE_HW_BRANCHES: return 1; } return 0; @@ -188,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, #else # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) #endif /* CONFIG_DYNAMIC_FTRACE */ + /* * Simple verification test of ftrace function tracer. * Enable ftrace, sleep 1/10 second, and then read the trace @@ -749,3 +751,59 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) return ret; } #endif /* CONFIG_BRANCH_TRACER */ + +#ifdef CONFIG_HW_BRANCH_TRACER +int +trace_selftest_startup_hw_branches(struct tracer *trace, + struct trace_array *tr) +{ + struct trace_iterator *iter; + struct tracer tracer; + unsigned long count; + int ret; + + if (!trace->open) { + printk(KERN_CONT "missing open function..."); + return -1; + } + + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* + * The hw-branch tracer needs to collect the trace from the various + * cpu trace buffers - before tracing is stopped. + */ + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + memcpy(&tracer, trace, sizeof(tracer)); + + iter->trace = &tracer; + iter->tr = tr; + iter->pos = -1; + mutex_init(&iter->mutex); + + trace->open(iter); + + mutex_destroy(&iter->mutex); + kfree(iter); + + tracing_stop(); + + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT "no entries found.."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_HW_BRANCH_TRACER */ diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c750f65..2d7aebd 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries); + max_stack_trace.nr_entries - 1); if (!stack_tracer_enabled && !max_stack_size) print_disabled(m); @@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace); static __init int stack_trace_init(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("stack_max_size", 0644, d_tracer, - &max_stack_size, &stack_max_size_fops); - if (!entry) - pr_warning("Could not create debugfs 'stack_max_size' entry\n"); + trace_create_file("stack_max_size", 0644, d_tracer, + &max_stack_size, &stack_max_size_fops); - entry = debugfs_create_file("stack_trace", 0444, d_tracer, - NULL, &stack_trace_fops); - if (!entry) - pr_warning("Could not create debugfs 'stack_trace' entry\n"); + trace_create_file("stack_trace", 0444, d_tracer, + NULL, &stack_trace_fops); if (stack_tracer_enabled) register_ftrace_function(&trace_ops); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index acdebd7..c006437 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -1,7 +1,7 @@ /* * Infrastructure for statistic tracing (histogram output). * - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> + * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com> * * Based on the code from trace_branch.c which is * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> @@ -10,22 +10,27 @@ #include <linux/list.h> +#include <linux/rbtree.h> #include <linux/debugfs.h> #include "trace_stat.h" #include "trace.h" -/* List of stat entries from a tracer */ -struct trace_stat_list { - struct list_head list; +/* + * List of stat red-black nodes from a tracer + * We use a such tree to sort quickly the stat + * entries from the tracer. + */ +struct stat_node { + struct rb_node node; void *stat; }; /* A stat session is the stats output in one file */ -struct tracer_stat_session { +struct stat_session { struct list_head session_list; struct tracer_stat *ts; - struct list_head stat_list; + struct rb_root stat_root; struct mutex stat_mutex; struct dentry *file; }; @@ -37,18 +42,48 @@ static DEFINE_MUTEX(all_stat_sessions_mutex); /* The root directory for all stat files */ static struct dentry *stat_dir; +/* + * Iterate through the rbtree using a post order traversal path + * to release the next node. + * It won't necessary release one at each iteration + * but it will at least advance closer to the next one + * to be released. + */ +static struct rb_node *release_next(struct rb_node *node) +{ + struct stat_node *snode; + struct rb_node *parent = rb_parent(node); + + if (node->rb_left) + return node->rb_left; + else if (node->rb_right) + return node->rb_right; + else { + if (!parent) + ; + else if (parent->rb_left == node) + parent->rb_left = NULL; + else + parent->rb_right = NULL; + + snode = container_of(node, struct stat_node, node); + kfree(snode); + + return parent; + } +} -static void reset_stat_session(struct tracer_stat_session *session) +static void reset_stat_session(struct stat_session *session) { - struct trace_stat_list *node, *next; + struct rb_node *node = session->stat_root.rb_node; - list_for_each_entry_safe(node, next, &session->stat_list, list) - kfree(node); + while (node) + node = release_next(node); - INIT_LIST_HEAD(&session->stat_list); + session->stat_root = RB_ROOT; } -static void destroy_session(struct tracer_stat_session *session) +static void destroy_session(struct stat_session *session) { debugfs_remove(session->file); reset_stat_session(session); @@ -56,25 +91,60 @@ static void destroy_session(struct tracer_stat_session *session) kfree(session); } +typedef int (*cmp_stat_t)(void *, void *); + +static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct stat_node *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->stat = stat; + + /* + * Figure out where to put new node + * This is a descendent sorting + */ + while (*new) { + struct stat_node *this; + int result; + + this = container_of(*new, struct stat_node, node); + result = cmp(data->stat, this->stat); + + parent = *new; + if (result >= 0) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); + return 0; +} + /* * For tracers that don't provide a stat_cmp callback. - * This one will force an immediate insertion on tail of - * the list. + * This one will force an insertion as right-most node + * in the rbtree. */ static int dummy_cmp(void *p1, void *p2) { - return 1; + return -1; } /* - * Initialize the stat list at each trace_stat file opening. + * Initialize the stat rbtree at each trace_stat file opening. * All of these copies and sorting are required on all opening * since the stats could have changed between two file sessions. */ -static int stat_seq_init(struct tracer_stat_session *session) +static int stat_seq_init(struct stat_session *session) { - struct trace_stat_list *iter_entry, *new_entry; struct tracer_stat *ts = session->ts; + struct rb_root *root = &session->stat_root; void *stat; int ret = 0; int i; @@ -85,29 +155,16 @@ static int stat_seq_init(struct tracer_stat_session *session) if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; - stat = ts->stat_start(); + stat = ts->stat_start(ts); if (!stat) goto exit; - /* - * The first entry. Actually this is the second, but the first - * one (the stat_list head) is pointless. - */ - new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); - if (!new_entry) { - ret = -ENOMEM; + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) goto exit; - } - - INIT_LIST_HEAD(&new_entry->list); - - list_add(&new_entry->list, &session->stat_list); - - new_entry->stat = stat; /* - * Iterate over the tracer stat entries and store them in a sorted - * list. + * Iterate over the tracer stat entries and store them in an rbtree. */ for (i = 1; ; i++) { stat = ts->stat_next(stat, i); @@ -116,36 +173,16 @@ static int stat_seq_init(struct tracer_stat_session *session) if (!stat) break; - new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); - if (!new_entry) { - ret = -ENOMEM; - goto exit_free_list; - } - - INIT_LIST_HEAD(&new_entry->list); - new_entry->stat = stat; - - list_for_each_entry_reverse(iter_entry, &session->stat_list, - list) { - - /* Insertion with a descendent sorting */ - if (ts->stat_cmp(iter_entry->stat, - new_entry->stat) >= 0) { - - list_add(&new_entry->list, &iter_entry->list); - break; - } - } - - /* The current larger value */ - if (list_empty(&new_entry->list)) - list_add(&new_entry->list, &session->stat_list); + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) + goto exit_free_rbtree; } + exit: mutex_unlock(&session->stat_mutex); return ret; -exit_free_list: +exit_free_rbtree: reset_stat_session(session); mutex_unlock(&session->stat_mutex); return ret; @@ -154,38 +191,51 @@ exit_free_list: static void *stat_seq_start(struct seq_file *s, loff_t *pos) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; + struct rb_node *node; + int i; - /* Prevent from tracer switch or stat_list modification */ + /* Prevent from tracer switch or rbtree modification */ mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ - if (!*pos && session->ts->stat_headers) + if (!*pos && session->ts->stat_headers) { + (*pos)++; return SEQ_START_TOKEN; + } - return seq_list_start(&session->stat_list, *pos); + node = rb_first(&session->stat_root); + for (i = 0; node && i < *pos; i++) + node = rb_next(node); + + (*pos)++; + + return node; } static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; + struct rb_node *node = p; + + (*pos)++; if (p == SEQ_START_TOKEN) - return seq_list_start(&session->stat_list, *pos); + return rb_first(&session->stat_root); - return seq_list_next(p, &session->stat_list, pos); + return rb_next(node); } static void stat_seq_stop(struct seq_file *s, void *p) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; mutex_unlock(&session->stat_mutex); } static int stat_seq_show(struct seq_file *s, void *v) { - struct tracer_stat_session *session = s->private; - struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); + struct stat_session *session = s->private; + struct stat_node *l = container_of(v, struct stat_node, node); if (v == SEQ_START_TOKEN) return session->ts->stat_headers(s); @@ -205,7 +255,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file) { int ret; - struct tracer_stat_session *session = inode->i_private; + struct stat_session *session = inode->i_private; ret = seq_open(file, &trace_stat_seq_ops); if (!ret) { @@ -218,11 +268,11 @@ static int tracing_stat_open(struct inode *inode, struct file *file) } /* - * Avoid consuming memory with our now useless list. + * Avoid consuming memory with our now useless rbtree. */ static int tracing_stat_release(struct inode *i, struct file *f) { - struct tracer_stat_session *session = i->i_private; + struct stat_session *session = i->i_private; mutex_lock(&session->stat_mutex); reset_stat_session(session); @@ -251,7 +301,7 @@ static int tracing_stat_init(void) return 0; } -static int init_stat_file(struct tracer_stat_session *session) +static int init_stat_file(struct stat_session *session) { if (!stat_dir && tracing_stat_init()) return -ENODEV; @@ -266,7 +316,7 @@ static int init_stat_file(struct tracer_stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { - struct tracer_stat_session *session, *node, *tmp; + struct stat_session *session, *node; int ret; if (!trace) @@ -277,7 +327,7 @@ int register_stat_tracer(struct tracer_stat *trace) /* Already registered? */ mutex_lock(&all_stat_sessions_mutex); - list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + list_for_each_entry(node, &all_stat_sessions, session_list) { if (node->ts == trace) { mutex_unlock(&all_stat_sessions_mutex); return -EINVAL; @@ -286,15 +336,13 @@ int register_stat_tracer(struct tracer_stat *trace) mutex_unlock(&all_stat_sessions_mutex); /* Init the session */ - session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); + session = kzalloc(sizeof(*session), GFP_KERNEL); if (!session) return -ENOMEM; session->ts = trace; INIT_LIST_HEAD(&session->session_list); - INIT_LIST_HEAD(&session->stat_list); mutex_init(&session->stat_mutex); - session->file = NULL; ret = init_stat_file(session); if (ret) { @@ -312,7 +360,7 @@ int register_stat_tracer(struct tracer_stat *trace) void unregister_stat_tracer(struct tracer_stat *trace) { - struct tracer_stat_session *node, *tmp; + struct stat_session *node, *tmp; mutex_lock(&all_stat_sessions_mutex); list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 202274c..f3546a2 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -12,7 +12,7 @@ struct tracer_stat { /* The name of your stat file */ const char *name; /* Iteration over statistic entries */ - void *(*stat_start)(void); + void *(*stat_start)(struct tracer_stat *trace); void *(*stat_next)(void *prev, int idx); /* Compare two entries for stats sorting */ int (*stat_cmp)(void *p1, void *p2); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 91fd19c..e04b76c 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -321,11 +321,7 @@ static const struct file_operations sysprof_sample_fops = { void init_tracer_sysprof_debugfs(struct dentry *d_tracer) { - struct dentry *entry; - entry = debugfs_create_file("sysprof_sample_period", 0644, + trace_create_file("sysprof_sample_period", 0644, d_tracer, NULL, &sysprof_sample_fops); - if (entry) - return; - pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n"); } diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 797201e..97fcea4 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -6,7 +6,7 @@ */ -#include <trace/workqueue.h> +#include <trace/events/workqueue.h> #include <linux/list.h> #include <linux/percpu.h> #include "trace_stat.h" @@ -16,8 +16,6 @@ /* A cpu workqueue thread */ struct cpu_workqueue_stats { struct list_head list; -/* Useful to know if we print the cpu headers */ - bool first_entry; int cpu; pid_t pid; /* Can be inserted from interrupt or user context, need to be atomic */ @@ -47,12 +45,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node, *next; + struct cpu_workqueue_stats *node; unsigned long flags; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, - list) { + list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { atomic_inc(&node->inserted); goto found; @@ -69,12 +66,11 @@ probe_workqueue_execution(struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node, *next; + struct cpu_workqueue_stats *node; unsigned long flags; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, - list) { + list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { node->executed++; goto found; @@ -105,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) cws->pid = wq_thread->pid; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (list_empty(&workqueue_cpu_stat(cpu)->list)) - cws->first_entry = true; list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); } @@ -152,7 +146,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) return ret; } -static void *workqueue_stat_start(void) +static void *workqueue_stat_start(struct tracer_stat *trace) { int cpu; void *ret = NULL; @@ -191,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx) static int workqueue_stat_show(struct seq_file *s, void *p) { struct cpu_workqueue_stats *cws = p; - unsigned long flags; - int cpu = cws->cpu; struct pid *pid; struct task_struct *tsk; - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (&cws->list == workqueue_cpu_stat(cpu)->list.next) - seq_printf(s, "\n"); - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - pid = find_get_pid(cws->pid); if (pid) { tsk = get_pid_task(pid, PIDTYPE_PID); diff --git a/kernel/wait.c b/kernel/wait.c index 42a2dbc..ea7c3b4 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_common(q, mode, 1, 0, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f71fb2a..0668795 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -33,7 +33,8 @@ #include <linux/kallsyms.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> -#include <trace/workqueue.h> +#define CREATE_TRACE_POINTS +#include <trace/events/workqueue.h> /* * The per-CPU workqueue (if single thread, we always use the first @@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); } -DEFINE_TRACE(workqueue_insertion); - static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head) { @@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); -DEFINE_TRACE(workqueue_execution); - static void run_workqueue(struct cpu_workqueue_struct *cwq) { spin_lock_irq(&cwq->lock); @@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu) return cwq; } -DEFINE_TRACE(workqueue_creation); - static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; @@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name, } EXPORT_SYMBOL_GPL(__create_workqueue_key); -DEFINE_TRACE(workqueue_destruction); - static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) { /* |