diff options
author | Frederic Weisbecker <fweisbec@gmail.com> | 2009-09-23 23:08:43 +0200 |
---|---|---|
committer | Frederic Weisbecker <fweisbec@gmail.com> | 2009-09-23 23:08:43 +0200 |
commit | d7a4b414eed51f1653bb05ebe84122bf9a7ae18b (patch) | |
tree | bd6603a0c27de4c138a1767871897e9cd3e1a1d2 /kernel | |
parent | 1f0ab40976460bc4673fa204ce917a725185d8f2 (diff) | |
parent | a724eada8c2a7b62463b73ccf73fd0bb6e928aeb (diff) | |
download | op-kernel-dev-d7a4b414eed51f1653bb05ebe84122bf9a7ae18b.zip op-kernel-dev-d7a4b414eed51f1653bb05ebe84122bf9a7ae18b.tar.gz |
Merge commit 'linus/master' into tracing/kprobes
Conflicts:
kernel/trace/Makefile
kernel/trace/trace.h
kernel/trace/trace_event_types.h
kernel/trace/trace_export.c
Merge reason:
Sync with latest significant tracing core changes.
Diffstat (limited to 'kernel')
106 files changed, 12088 insertions, 12648 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 2093a69..187c89b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -80,26 +80,22 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o obj-$(CONFIG_TREE_RCU) += rcutree.o -obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o +obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o -obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o -obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o -obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is @@ -119,7 +115,7 @@ $(obj)/config_data.gz: .config FORCE $(call if_changed,gzip) quiet_cmd_ikconfiggz = IKCFG $@ - cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ + cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call if_changed,ikconfiggz) diff --git a/kernel/acct.c b/kernel/acct.c index 9f33910..9a4715a 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -491,13 +491,17 @@ static void do_acct_process(struct bsd_acct_struct *acct, u64 run_time; struct timespec uptime; struct tty_struct *tty; + const struct cred *orig_cred; + + /* Perform file operations on behalf of whoever enabled accounting */ + orig_cred = override_creds(file->f_cred); /* * First check to see if there is enough free_space to continue * the process accounting system. */ if (!check_free_space(acct, file)) - return; + goto out; /* * Fill the accounting struct with the needed info as recorded @@ -578,6 +582,8 @@ static void do_acct_process(struct bsd_acct_struct *acct, sizeof(acct_t), &file->f_pos); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; set_fs(fs); +out: + revert_creds(orig_cred); } /** diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b6eadfe..cd83d99 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -596,10 +596,11 @@ void cgroup_unlock(void) static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); -static struct inode_operations cgroup_dir_inode_operations; +static const struct inode_operations cgroup_dir_inode_operations; static struct file_operations proc_cgroupstats_operations; static struct backing_dev_info cgroup_backing_dev_info = { + .name = "cgroup", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; @@ -960,7 +961,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) return ret; } -static struct super_operations cgroup_ops = { +static const struct super_operations cgroup_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = cgroup_show_options, @@ -1710,7 +1711,7 @@ static struct file_operations cgroup_file_operations = { .release = cgroup_file_release, }; -static struct inode_operations cgroup_dir_inode_operations = { +static const struct inode_operations cgroup_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, @@ -2313,7 +2314,7 @@ static int cgroup_tasks_show(struct seq_file *s, void *v) return seq_printf(s, "%d\n", *(int *)v); } -static struct seq_operations cgroup_tasks_seq_operations = { +static const struct seq_operations cgroup_tasks_seq_operations = { .start = cgroup_tasks_start, .stop = cgroup_tasks_stop, .next = cgroup_tasks_next, diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce1004..6ba0f1e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -401,6 +401,7 @@ int disable_nonboot_cpus(void) break; } } + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ @@ -413,6 +414,14 @@ int disable_nonboot_cpus(void) return error; } +void __weak arch_enable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_enable_nonboot_cpus_end(void) +{ +} + void __ref enable_nonboot_cpus(void) { int cpu, error; @@ -424,6 +433,9 @@ void __ref enable_nonboot_cpus(void) goto out; printk("Enabling non-boot CPUs ...\n"); + + arch_enable_nonboot_cpus_begin(); + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { @@ -432,6 +444,9 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } + + arch_enable_nonboot_cpus_end(); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); diff --git a/kernel/cred.c b/kernel/cred.c index 1bb4d7e..d7f7a01 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -18,6 +18,18 @@ #include <linux/cn_proc.h> #include "cred-internals.h" +#if 0 +#define kdebug(FMT, ...) \ + printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) +#else +static inline __attribute__((format(printf, 1, 2))) +void no_printk(const char *fmt, ...) +{ +} +#define kdebug(FMT, ...) \ + no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) +#endif + static struct kmem_cache *cred_jar; /* @@ -36,6 +48,10 @@ static struct thread_group_cred init_tgcred = { */ struct cred init_cred = { .usage = ATOMIC_INIT(4), +#ifdef CONFIG_DEBUG_CREDENTIALS + .subscribers = ATOMIC_INIT(2), + .magic = CRED_MAGIC, +#endif .securebits = SECUREBITS_DEFAULT, .cap_inheritable = CAP_INIT_INH_SET, .cap_permitted = CAP_FULL_SET, @@ -48,6 +64,31 @@ struct cred init_cred = { #endif }; +static inline void set_cred_subscribers(struct cred *cred, int n) +{ +#ifdef CONFIG_DEBUG_CREDENTIALS + atomic_set(&cred->subscribers, n); +#endif +} + +static inline int read_cred_subscribers(const struct cred *cred) +{ +#ifdef CONFIG_DEBUG_CREDENTIALS + return atomic_read(&cred->subscribers); +#else + return 0; +#endif +} + +static inline void alter_cred_subscribers(const struct cred *_cred, int n) +{ +#ifdef CONFIG_DEBUG_CREDENTIALS + struct cred *cred = (struct cred *) _cred; + + atomic_add(n, &cred->subscribers); +#endif +} + /* * Dispose of the shared task group credentials */ @@ -85,15 +126,29 @@ static void put_cred_rcu(struct rcu_head *rcu) { struct cred *cred = container_of(rcu, struct cred, rcu); + kdebug("put_cred_rcu(%p)", cred); + +#ifdef CONFIG_DEBUG_CREDENTIALS + if (cred->magic != CRED_MAGIC_DEAD || + atomic_read(&cred->usage) != 0 || + read_cred_subscribers(cred) != 0) + panic("CRED: put_cred_rcu() sees %p with" + " mag %x, put %p, usage %d, subscr %d\n", + cred, cred->magic, cred->put_addr, + atomic_read(&cred->usage), + read_cred_subscribers(cred)); +#else if (atomic_read(&cred->usage) != 0) panic("CRED: put_cred_rcu() sees %p with usage %d\n", cred, atomic_read(&cred->usage)); +#endif security_cred_free(cred); key_put(cred->thread_keyring); key_put(cred->request_key_auth); release_tgcred(cred); - put_group_info(cred->group_info); + if (cred->group_info) + put_group_info(cred->group_info); free_uid(cred->user); kmem_cache_free(cred_jar, cred); } @@ -106,12 +161,90 @@ static void put_cred_rcu(struct rcu_head *rcu) */ void __put_cred(struct cred *cred) { + kdebug("__put_cred(%p{%d,%d})", cred, + atomic_read(&cred->usage), + read_cred_subscribers(cred)); + BUG_ON(atomic_read(&cred->usage) != 0); +#ifdef CONFIG_DEBUG_CREDENTIALS + BUG_ON(read_cred_subscribers(cred) != 0); + cred->magic = CRED_MAGIC_DEAD; + cred->put_addr = __builtin_return_address(0); +#endif + BUG_ON(cred == current->cred); + BUG_ON(cred == current->real_cred); call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); +/* + * Clean up a task's credentials when it exits + */ +void exit_creds(struct task_struct *tsk) +{ + struct cred *cred; + + kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred, + atomic_read(&tsk->cred->usage), + read_cred_subscribers(tsk->cred)); + + cred = (struct cred *) tsk->real_cred; + tsk->real_cred = NULL; + validate_creds(cred); + alter_cred_subscribers(cred, -1); + put_cred(cred); + + cred = (struct cred *) tsk->cred; + tsk->cred = NULL; + validate_creds(cred); + alter_cred_subscribers(cred, -1); + put_cred(cred); + + cred = (struct cred *) tsk->replacement_session_keyring; + if (cred) { + tsk->replacement_session_keyring = NULL; + validate_creds(cred); + put_cred(cred); + } +} + +/* + * Allocate blank credentials, such that the credentials can be filled in at a + * later date without risk of ENOMEM. + */ +struct cred *cred_alloc_blank(void) +{ + struct cred *new; + + new = kmem_cache_zalloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + +#ifdef CONFIG_KEYS + new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); + if (!new->tgcred) { + kfree(new); + return NULL; + } + atomic_set(&new->tgcred->usage, 1); +#endif + + atomic_set(&new->usage, 1); + + if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) + goto error; + +#ifdef CONFIG_DEBUG_CREDENTIALS + new->magic = CRED_MAGIC; +#endif + return new; + +error: + abort_creds(new); + return NULL; +} + /** * prepare_creds - Prepare a new set of credentials for modification * @@ -132,16 +265,19 @@ struct cred *prepare_creds(void) const struct cred *old; struct cred *new; - BUG_ON(atomic_read(&task->real_cred->usage) < 1); + validate_process_creds(); new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; + kdebug("prepare_creds() alloc %p", new); + old = task->cred; memcpy(new, old, sizeof(struct cred)); atomic_set(&new->usage, 1); + set_cred_subscribers(new, 0); get_group_info(new->group_info); get_uid(new->user); @@ -157,6 +293,7 @@ struct cred *prepare_creds(void) if (security_prepare_creds(new, old, GFP_KERNEL) < 0) goto error; + validate_creds(new); return new; error: @@ -229,9 +366,12 @@ struct cred *prepare_usermodehelper_creds(void) if (!new) return NULL; + kdebug("prepare_usermodehelper_creds() alloc %p", new); + memcpy(new, &init_cred, sizeof(struct cred)); atomic_set(&new->usage, 1); + set_cred_subscribers(new, 0); get_group_info(new->group_info); get_uid(new->user); @@ -250,6 +390,7 @@ struct cred *prepare_usermodehelper_creds(void) #endif if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0) goto error; + validate_creds(new); BUG_ON(atomic_read(&new->usage) != 1); return new; @@ -286,6 +427,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) ) { p->real_cred = get_cred(p->cred); get_cred(p->cred); + alter_cred_subscribers(p->cred, 2); + kdebug("share_creds(%p{%d,%d})", + p->cred, atomic_read(&p->cred->usage), + read_cred_subscribers(p->cred)); atomic_inc(&p->cred->user->processes); return 0; } @@ -331,6 +476,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) atomic_inc(&new->user->processes); p->cred = p->real_cred = get_cred(new); + alter_cred_subscribers(new, 2); + validate_creds(new); return 0; error_put: @@ -355,13 +502,20 @@ error_put: int commit_creds(struct cred *new) { struct task_struct *task = current; - const struct cred *old; + const struct cred *old = task->real_cred; - BUG_ON(task->cred != task->real_cred); - BUG_ON(atomic_read(&task->real_cred->usage) < 2); + kdebug("commit_creds(%p{%d,%d})", new, + atomic_read(&new->usage), + read_cred_subscribers(new)); + + BUG_ON(task->cred != old); +#ifdef CONFIG_DEBUG_CREDENTIALS + BUG_ON(read_cred_subscribers(old) < 2); + validate_creds(old); + validate_creds(new); +#endif BUG_ON(atomic_read(&new->usage) < 1); - old = task->real_cred; security_commit_creds(new, old); get_cred(new); /* we will require a ref for the subj creds too */ @@ -390,12 +544,14 @@ int commit_creds(struct cred *new) * cheaply with the new uid cache, so if it matters * we should be checking for it. -DaveM */ + alter_cred_subscribers(new, 2); if (new->user != old->user) atomic_inc(&new->user->processes); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user) atomic_dec(&old->user->processes); + alter_cred_subscribers(old, -2); sched_switch_user(task); @@ -428,6 +584,13 @@ EXPORT_SYMBOL(commit_creds); */ void abort_creds(struct cred *new) { + kdebug("abort_creds(%p{%d,%d})", new, + atomic_read(&new->usage), + read_cred_subscribers(new)); + +#ifdef CONFIG_DEBUG_CREDENTIALS + BUG_ON(read_cred_subscribers(new) != 0); +#endif BUG_ON(atomic_read(&new->usage) < 1); put_cred(new); } @@ -444,7 +607,20 @@ const struct cred *override_creds(const struct cred *new) { const struct cred *old = current->cred; - rcu_assign_pointer(current->cred, get_cred(new)); + kdebug("override_creds(%p{%d,%d})", new, + atomic_read(&new->usage), + read_cred_subscribers(new)); + + validate_creds(old); + validate_creds(new); + get_cred(new); + alter_cred_subscribers(new, 1); + rcu_assign_pointer(current->cred, new); + alter_cred_subscribers(old, -1); + + kdebug("override_creds() = %p{%d,%d}", old, + atomic_read(&old->usage), + read_cred_subscribers(old)); return old; } EXPORT_SYMBOL(override_creds); @@ -460,7 +636,15 @@ void revert_creds(const struct cred *old) { const struct cred *override = current->cred; + kdebug("revert_creds(%p{%d,%d})", old, + atomic_read(&old->usage), + read_cred_subscribers(old)); + + validate_creds(old); + validate_creds(override); + alter_cred_subscribers(old, 1); rcu_assign_pointer(current->cred, old); + alter_cred_subscribers(override, -1); put_cred(override); } EXPORT_SYMBOL(revert_creds); @@ -502,11 +686,15 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (!new) return NULL; + kdebug("prepare_kernel_cred() alloc %p", new); + if (daemon) old = get_task_cred(daemon); else old = get_cred(&init_cred); + validate_creds(old); + *new = *old; get_uid(new->user); get_group_info(new->group_info); @@ -526,7 +714,9 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) goto error; atomic_set(&new->usage, 1); + set_cred_subscribers(new, 0); put_cred(old); + validate_creds(new); return new; error: @@ -589,3 +779,95 @@ int set_create_files_as(struct cred *new, struct inode *inode) return security_kernel_create_files_as(new, inode); } EXPORT_SYMBOL(set_create_files_as); + +#ifdef CONFIG_DEBUG_CREDENTIALS + +/* + * dump invalid credentials + */ +static void dump_invalid_creds(const struct cred *cred, const char *label, + const struct task_struct *tsk) +{ + printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n", + label, cred, + cred == &init_cred ? "[init]" : "", + cred == tsk->real_cred ? "[real]" : "", + cred == tsk->cred ? "[eff]" : ""); + printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n", + cred->magic, cred->put_addr); + printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n", + atomic_read(&cred->usage), + read_cred_subscribers(cred)); + printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", + cred->uid, cred->euid, cred->suid, cred->fsuid); + printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", + cred->gid, cred->egid, cred->sgid, cred->fsgid); +#ifdef CONFIG_SECURITY + printk(KERN_ERR "CRED: ->security is %p\n", cred->security); + if ((unsigned long) cred->security >= PAGE_SIZE && + (((unsigned long) cred->security & 0xffffff00) != + (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))) + printk(KERN_ERR "CRED: ->security {%x, %x}\n", + ((u32*)cred->security)[0], + ((u32*)cred->security)[1]); +#endif +} + +/* + * report use of invalid credentials + */ +void __invalid_creds(const struct cred *cred, const char *file, unsigned line) +{ + printk(KERN_ERR "CRED: Invalid credentials\n"); + printk(KERN_ERR "CRED: At %s:%u\n", file, line); + dump_invalid_creds(cred, "Specified", current); + BUG(); +} +EXPORT_SYMBOL(__invalid_creds); + +/* + * check the credentials on a process + */ +void __validate_process_creds(struct task_struct *tsk, + const char *file, unsigned line) +{ + if (tsk->cred == tsk->real_cred) { + if (unlikely(read_cred_subscribers(tsk->cred) < 2 || + creds_are_invalid(tsk->cred))) + goto invalid_creds; + } else { + if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 || + read_cred_subscribers(tsk->cred) < 1 || + creds_are_invalid(tsk->real_cred) || + creds_are_invalid(tsk->cred))) + goto invalid_creds; + } + return; + +invalid_creds: + printk(KERN_ERR "CRED: Invalid process credentials\n"); + printk(KERN_ERR "CRED: At %s:%u\n", file, line); + + dump_invalid_creds(tsk->real_cred, "Real", tsk); + if (tsk->cred != tsk->real_cred) + dump_invalid_creds(tsk->cred, "Effective", tsk); + else + printk(KERN_ERR "CRED: Effective creds == Real creds\n"); + BUG(); +} +EXPORT_SYMBOL(__validate_process_creds); + +/* + * check creds for do_exit() + */ +void validate_creds_for_do_exit(struct task_struct *tsk) +{ + kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})", + tsk->real_cred, tsk->cred, + atomic_read(&tsk->cred->usage), + read_cred_subscribers(tsk->cred)); + + __validate_process_creds(tsk, __FILE__, __LINE__); +} + +#endif /* CONFIG_DEBUG_CREDENTIALS */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index abb6e17..ead9b61 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -15,6 +15,7 @@ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/taskstats.h> #include <linux/time.h> #include <linux/sysctl.h> #include <linux/delayacct.h> diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c deleted file mode 100644 index 962a3b5..0000000 --- a/kernel/dma-coherent.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Coherent per-device memory handling. - * Borrowed from i386 - */ -#include <linux/kernel.h> -#include <linux/dma-mapping.h> - -struct dma_coherent_mem { - void *virt_base; - u32 device_base; - int size; - int flags; - unsigned long *bitmap; -}; - -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base = NULL; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem); - out: - if (mem_base) - iounmap(mem_base); - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pos, err; - - size += device_addr & ~PAGE_MASK; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -/** - * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area - * - * @dev: device from which we allocate memory - * @size: size of requested memory area - * @dma_handle: This will be filled with the correct dma handle - * @ret: This pointer will be filled with the virtual address - * to allocated area. - * - * This function should be only called from per-arch dma_alloc_coherent() - * to support allocation from per-device coherent memory pools. - * - * Returns 0 if dma_alloc_coherent should continue with allocating from - * generic memory areas, or !0 if dma_alloc_coherent should return @ret. - */ -int dma_alloc_from_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) -{ - struct dma_coherent_mem *mem; - int order = get_order(size); - int pageno; - - if (!dev) - return 0; - mem = dev->dma_mem; - if (!mem) - return 0; - - *ret = NULL; - - if (unlikely(size > (mem->size << PAGE_SHIFT))) - goto err; - - pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); - if (unlikely(pageno < 0)) - goto err; - - /* - * Memory was found in the per-device area. - */ - *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); - *ret = mem->virt_base + (pageno << PAGE_SHIFT); - memset(*ret, 0, size); - - return 1; - -err: - /* - * In the case where the allocation can not be satisfied from the - * per-device area, try to fall back to generic memory if the - * constraints allow it. - */ - return mem->flags & DMA_MEMORY_EXCLUSIVE; -} -EXPORT_SYMBOL(dma_alloc_from_coherent); - -/** - * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool - * @dev: device from which the memory was allocated - * @order: the order of pages allocated - * @vaddr: virtual address of allocated pages - * - * This checks whether the memory was allocated from the per-device - * coherent memory pool and if so, releases that memory. - * - * Returns 1 if we correctly released the memory, or 0 if - * dma_release_coherent() should proceed with releasing memory from - * generic pools. - */ -int dma_release_from_coherent(struct device *dev, int order, void *vaddr) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - - if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - return 1; - } - return 0; -} -EXPORT_SYMBOL(dma_release_from_coherent); diff --git a/kernel/exit.c b/kernel/exit.c index 869dc22..60d6fdc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,7 +47,7 @@ #include <linux/tracehook.h> #include <linux/fs_struct.h> #include <linux/init_task.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <trace/events/sched.h> #include <asm/uaccess.h> @@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(tsk->perf_counter_ctxp); +#ifdef CONFIG_PERF_EVENTS + WARN_ON_ONCE(tsk->perf_event_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); @@ -359,8 +359,10 @@ void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - if (task_session(curr) != pid) + if (task_session(curr) != pid) { change_pid(curr, PIDTYPE_SID, pid); + proc_sid_connector(curr); + } if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); @@ -901,6 +903,8 @@ NORET_TYPE void do_exit(long code) tracehook_report_exit(&code); + validate_creds_for_do_exit(tsk); + /* * We're taking recursive faults here in do_exit. Safest is to just * leave this task alone and wait for reboot. @@ -943,6 +947,8 @@ NORET_TYPE void do_exit(long code) if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); + if (tsk->mm) + setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) @@ -979,7 +985,7 @@ NORET_TYPE void do_exit(long code) * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ - perf_counter_exit_task(tsk); + perf_event_exit_task(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA @@ -1009,7 +1015,10 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); + validate_creds_for_do_exit(tsk); + preempt_disable(); + exit_rcu(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; schedule(); @@ -1203,6 +1212,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (likely(!traced) && likely(!task_detached(p))) { struct signal_struct *psig; struct signal_struct *sig; + unsigned long maxrss; /* * The resource counters for the group leader are in its @@ -1251,6 +1261,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; + maxrss = max(sig->maxrss, sig->cmaxrss); + if (psig->cmaxrss < maxrss) + psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->real_parent->sighand->siglock); diff --git a/kernel/fork.c b/kernel/fork.c index e6c04d4..51ad0b0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -49,6 +49,7 @@ #include <linux/ftrace.h> #include <linux/profile.h> #include <linux/rmap.h> +#include <linux/ksm.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> @@ -61,7 +62,8 @@ #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> +#include <linux/posix-timers.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -136,9 +138,17 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +static void account_kernel_stack(struct thread_info *ti, int account) +{ + struct zone *zone = page_zone(virt_to_page(ti)); + + mod_zone_page_state(zone, NR_KERNEL_STACK, account); +} + void free_task(struct task_struct *tsk) { prop_local_destroy_single(&tsk->dirties); + account_kernel_stack(tsk->stack, -1); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); @@ -152,8 +162,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); - put_cred(tsk->real_cred); - put_cred(tsk->cred); + exit_creds(tsk); delayacct_tsk_free(tsk); if (!profile_handoff_task(tsk)) @@ -254,6 +263,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; + + account_kernel_stack(ti, 1); + return tsk; out: @@ -289,6 +301,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; + retval = ksm_fork(mm, oldmm); + if (retval) + goto out; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; @@ -425,7 +440,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; + mm->flags = (current->mm) ? + (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); @@ -486,6 +502,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); + ksm_exit(mm); exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { @@ -789,10 +806,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) thread_group_cputime_init(sig); /* Expiration times and increments. */ - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; + sig->it[CPUCLOCK_PROF].expires = cputime_zero; + sig->it[CPUCLOCK_PROF].incr = cputime_zero; + sig->it[CPUCLOCK_VIRT].expires = cputime_zero; + sig->it[CPUCLOCK_VIRT].incr = cputime_zero; /* Cached expiration times. */ sig->cputime_expires.prof_exp = cputime_zero; @@ -850,6 +867,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; + sig->maxrss = sig->cmaxrss = 0; task_io_accounting_init(&sig->ioac); sig->sum_sched_runtime = 0; taskstats_tgid_init(sig); @@ -864,6 +882,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); + sig->oom_adj = current->signal->oom_adj; + return 0; } @@ -1008,10 +1028,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); -#ifdef CONFIG_PREEMPT_RCU - p->rcu_read_lock_nesting = 0; - p->rcu_flipctr_idx = 0; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ + rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); @@ -1079,10 +1096,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->bts = NULL; + p->stack_start = stack_start; + /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); - retval = perf_counter_init_task(p); + retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; @@ -1257,7 +1276,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); - perf_counter_fork(p); + perf_event_fork(p); return p; bad_fork_free_pid: @@ -1284,7 +1303,7 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: - perf_counter_free_task(p); + perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: @@ -1297,8 +1316,7 @@ bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); - put_cred(p->real_cred); - put_cred(p->cred); + exit_creds(p); bad_fork_free: free_task(p); fork_out: diff --git a/kernel/futex.c b/kernel/futex.c index e18cfbd..248dd11 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -115,6 +115,9 @@ struct futex_q { /* rt_waiter storage for requeue_pi: */ struct rt_mutex_waiter *rt_waiter; + /* The expected requeue pi target futex key: */ + union futex_key *requeue_pi_key; + /* Bitset for the optional bitmasked wakeup */ u32 bitset; }; @@ -1089,6 +1092,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, if (!top_waiter) return 0; + /* Ensure we requeue to the expected futex. */ + if (!match_futex(top_waiter->requeue_pi_key, key2)) + return -EINVAL; + /* * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in * the contended case or if set_waiters is 1. The pi_state is returned @@ -1276,6 +1283,12 @@ retry_private: continue; } + /* Ensure we requeue to the expected futex for requeue_pi. */ + if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { + ret = -EINVAL; + break; + } + /* * Requeue nr_requeue waiters and possibly one more in the case * of requeue_pi if we couldn't acquire the lock atomically. @@ -1751,6 +1764,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, q.pi_state = NULL; q.bitset = bitset; q.rt_waiter = NULL; + q.requeue_pi_key = NULL; if (abs_time) { to = &timeout; @@ -1858,6 +1872,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, q.pi_state = NULL; q.rt_waiter = NULL; + q.requeue_pi_key = NULL; retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); @@ -2118,11 +2133,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * We call schedule in futex_wait_queue_me() when we enqueue and return there * via the following: * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() - * 2) wakeup on uaddr2 after a requeue and subsequent unlock - * 3) signal (before or after requeue) - * 4) timeout (before or after requeue) + * 2) wakeup on uaddr2 after a requeue + * 3) signal + * 4) timeout * - * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. + * If 3, cleanup and return -ERESTARTNOINTR. * * If 2, we may then block on trying to take the rt_mutex and return via: * 5) successful lock @@ -2130,7 +2145,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * 7) timeout * 8) other lock acquisition failure * - * If 6, we setup a restart_block with futex_lock_pi() as the function. + * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). * * If 4 or 7, we cleanup and return with -ETIMEDOUT. * @@ -2169,15 +2184,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, debug_rt_mutex_init_waiter(&rt_waiter); rt_waiter.task = NULL; - q.pi_state = NULL; - q.bitset = bitset; - q.rt_waiter = &rt_waiter; - key2 = FUTEX_KEY_INIT; ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; + q.pi_state = NULL; + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + q.requeue_pi_key = &key2; + /* Prepare to wait on uaddr. */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) @@ -2248,14 +2264,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, rt_mutex_unlock(pi_mutex); } else if (ret == -EINTR) { /* - * We've already been requeued, but we have no way to - * restart by calling futex_lock_pi() directly. We - * could restart the syscall, but that will look at - * the user space value and return right away. So we - * drop back with EWOULDBLOCK to tell user space that - * "val" has been changed. That's the same what the - * restart of the syscall would do in - * futex_wait_setup(). + * We've already been requeued, but cannot restart by calling + * futex_lock_pi() directly. We could restart this syscall, but + * it would detect that the user space "val" changed and return + * -EWOULDBLOCK. Save the overhead of the restart and return + * -EWOULDBLOCK directly. */ ret = -EWOULDBLOCK; } diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 22e9dcf..654efd0 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on S390 || X86 + depends on S390 || X86 || (PPC && EXPERIMENTAL) default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 49da79a..e5d98ce 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -48,36 +48,7 @@ #include <asm/uaccess.h> -/** - * ktime_get - get the monotonic time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get(void) -{ - struct timespec now; - - ktime_get_ts(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get); - -/** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ - struct timespec now; - - getnstimeofday(&now); - - return timespec_to_ktime(now); -} - -EXPORT_SYMBOL_GPL(ktime_get_real); +#include <trace/events/timer.h> /* * The timer bases: @@ -106,31 +77,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. - */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); - /* * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. @@ -485,6 +431,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, debug_object_init_on_stack(timer, &hrtimer_debug_descr); __hrtimer_init(timer, clock_id, mode); } +EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); void destroy_hrtimer_on_stack(struct hrtimer *timer) { @@ -497,6 +444,26 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif +static inline void +debug_init(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) +{ + debug_hrtimer_init(timer); + trace_hrtimer_init(timer, clockid, mode); +} + +static inline void debug_activate(struct hrtimer *timer) +{ + debug_hrtimer_activate(timer); + trace_hrtimer_start(timer); +} + +static inline void debug_deactivate(struct hrtimer *timer) +{ + debug_hrtimer_deactivate(timer); + trace_hrtimer_cancel(timer); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -853,7 +820,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer *entry; int leftmost = 1; - debug_hrtimer_activate(timer); + debug_activate(timer); /* * Find the right place in the rbtree: @@ -939,7 +906,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, @@ -1154,7 +1121,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_id = CLOCK_MONOTONIC; timer->base = &cpu_base->clock_base[clock_id]; - INIT_LIST_HEAD(&timer->cb_entry); hrtimer_init_timer_hres(timer); #ifdef CONFIG_TIMER_STATS @@ -1173,7 +1139,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { - debug_hrtimer_init(timer); + debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); @@ -1197,7 +1163,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) } EXPORT_SYMBOL_GPL(hrtimer_get_res); -static void __run_hrtimer(struct hrtimer *timer) +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) { struct hrtimer_clock_base *base = timer->base; struct hrtimer_cpu_base *cpu_base = base->cpu_base; @@ -1206,7 +1172,7 @@ static void __run_hrtimer(struct hrtimer *timer) WARN_ON(!irqs_disabled()); - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); timer_stats_account_hrtimer(timer); fn = timer->function; @@ -1217,7 +1183,9 @@ static void __run_hrtimer(struct hrtimer *timer) * the timer base. */ spin_unlock(&cpu_base->lock); + trace_hrtimer_expire_entry(timer, now); restart = fn(timer); + trace_hrtimer_expire_exit(timer); spin_lock(&cpu_base->lock); /* @@ -1328,7 +1296,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) break; } - __run_hrtimer(timer); + __run_hrtimer(timer, &basenow); } base++; } @@ -1450,7 +1418,7 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - __run_hrtimer(timer); + __run_hrtimer(timer, &base->softirq_time); } spin_unlock(&cpu_base->lock); } @@ -1477,6 +1445,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) sl->timer.function = hrtimer_wakeup; sl->task = task; } +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { @@ -1626,7 +1595,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); /* * Mark it as STATE_MIGRATE not INACTIVE otherwise the diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 13c68e7..c166019 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -222,6 +222,34 @@ int set_irq_chip_data(unsigned int irq, void *data) } EXPORT_SYMBOL(set_irq_chip_data); +/** + * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq + * + * @irq: Interrupt number + * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag + * + * The IRQ_NESTED_THREAD flag indicates that on + * request_threaded_irq() no separate interrupt thread should be + * created for the irq as the handler are called nested in the + * context of a demultiplexing interrupt handler thread. + */ +void set_irq_nested_thread(unsigned int irq, int nest) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + if (!desc) + return; + + spin_lock_irqsave(&desc->lock, flags); + if (nest) + desc->status |= IRQ_NESTED_THREAD; + else + desc->status &= ~IRQ_NESTED_THREAD; + spin_unlock_irqrestore(&desc->lock, flags); +} +EXPORT_SYMBOL_GPL(set_irq_nested_thread); + /* * default enable function */ @@ -299,6 +327,45 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) } } +/* + * handle_nested_irq - Handle a nested irq from a irq thread + * @irq: the interrupt number + * + * Handle interrupts which are nested into a threaded interrupt + * handler. The handler function is called inside the calling + * threads context. + */ +void handle_nested_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + irqreturn_t action_ret; + + might_sleep(); + + spin_lock_irq(&desc->lock); + + kstat_incr_irqs_this_cpu(irq, desc); + + action = desc->action; + if (unlikely(!action || (desc->status & IRQ_DISABLED))) + goto out_unlock; + + desc->status |= IRQ_INPROGRESS; + spin_unlock_irq(&desc->lock); + + action_ret = action->thread_fn(action->irq, action->dev_id); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + + spin_lock_irq(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; + +out_unlock: + spin_unlock_irq(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_nested_irq); + /** * handle_simple_irq - Simple and software-decoded IRQs. * @irq: the interrupt number @@ -382,7 +449,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; - if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + + if (unlikely(desc->status & IRQ_ONESHOT)) + desc->status |= IRQ_MASKED; + else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) desc->chip->unmask(irq); out_unlock: spin_unlock(&desc->lock); @@ -572,6 +642,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->chip = &dummy_irq_chip; } + chip_bus_lock(irq, desc); spin_lock_irqsave(&desc->lock, flags); /* Uninstall? */ @@ -591,6 +662,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->chip->startup(irq); } spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL_GPL(__set_irq_handler); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 065205b..a81cf80 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -161,7 +161,7 @@ int __init early_irq_init(void) desc = irq_desc_legacy; legacy_count = ARRAY_SIZE(irq_desc_legacy); - node = first_online_node; + node = first_online_node; /* allocate irq_desc_ptrs array based on nr_irqs */ irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); @@ -172,6 +172,9 @@ int __init early_irq_init(void) for (i = 0; i < legacy_count; i++) { desc[i].irq = i; +#ifdef CONFIG_SMP + desc[i].node = node; +#endif desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); alloc_desc_masks(&desc[i], node, true); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e70ed55..1b5d742 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -44,6 +44,19 @@ extern int irq_select_affinity_usr(unsigned int irq); extern void irq_set_thread_affinity(struct irq_desc *desc); +/* Inline functions for support of irq chips on slow busses */ +static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) +{ + if (unlikely(desc->chip->bus_lock)) + desc->chip->bus_lock(irq); +} + +static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) +{ + if (unlikely(desc->chip->bus_sync_unlock)) + desc->chip->bus_sync_unlock(irq); +} + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0ec9ed8..bde4c66 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -230,9 +230,11 @@ void disable_irq_nosync(unsigned int irq) if (!desc) return; + chip_bus_lock(irq, desc); spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, false); spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(disable_irq_nosync); @@ -294,7 +296,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) * matches the last disable, processing of interrupts on this * IRQ line is re-enabled. * - * This function may be called from IRQ context. + * This function may be called from IRQ context only when + * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { @@ -304,9 +307,11 @@ void enable_irq(unsigned int irq) if (!desc) return; + chip_bus_lock(irq, desc); spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(enable_irq); @@ -436,6 +441,26 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } +/* + * Default primary interrupt handler for threaded interrupts. Is + * assigned as primary handler when request_threaded_irq is called + * with handler == NULL. Useful for oneshot interrupts. + */ +static irqreturn_t irq_default_primary_handler(int irq, void *dev_id) +{ + return IRQ_WAKE_THREAD; +} + +/* + * Primary handler for nested threaded interrupts. Should never be + * called. + */ +static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) +{ + WARN(1, "Primary handler called for nested irq %d\n", irq); + return IRQ_NONE; +} + static int irq_wait_for_interrupt(struct irqaction *action) { while (!kthread_should_stop()) { @@ -451,6 +476,23 @@ static int irq_wait_for_interrupt(struct irqaction *action) return -1; } +/* + * Oneshot interrupts keep the irq line masked until the threaded + * handler finished. unmask if the interrupt has not been disabled and + * is marked MASKED. + */ +static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) +{ + chip_bus_lock(irq, desc); + spin_lock_irq(&desc->lock); + if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { + desc->status &= ~IRQ_MASKED; + desc->chip->unmask(irq); + } + spin_unlock_irq(&desc->lock); + chip_bus_sync_unlock(irq, desc); +} + #ifdef CONFIG_SMP /* * Check whether we need to change the affinity of the interrupt thread. @@ -492,7 +534,7 @@ static int irq_thread(void *data) struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - int wake; + int wake, oneshot = desc->status & IRQ_ONESHOT; sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -518,6 +560,9 @@ static int irq_thread(void *data) spin_unlock_irq(&desc->lock); action->thread_fn(action->irq, action->dev_id); + + if (oneshot) + irq_finalize_oneshot(action->irq, desc); } wake = atomic_dec_and_test(&desc->threads_active); @@ -565,7 +610,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) struct irqaction *old, **old_ptr; const char *old_name = NULL; unsigned long flags; - int shared = 0; + int nested, shared = 0; int ret; if (!desc) @@ -590,10 +635,32 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } + /* Oneshot interrupts are not allowed with shared */ + if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED)) + return -EINVAL; + + /* + * Check whether the interrupt nests into another interrupt + * thread. + */ + nested = desc->status & IRQ_NESTED_THREAD; + if (nested) { + if (!new->thread_fn) + return -EINVAL; + /* + * Replace the primary handler which was provided from + * the driver for non nested interrupt handling by the + * dummy function which warns when called. + */ + new->handler = irq_nested_primary_handler; + } + /* - * Threaded handler ? + * Create a handler thread when a thread function is supplied + * and the interrupt does not nest into another interrupt + * thread. */ - if (new->thread_fn) { + if (new->thread_fn && !nested) { struct task_struct *t; t = kthread_create(irq_thread, new, "irq/%d-%s", irq, @@ -662,9 +729,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | + desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); + if (new->flags & IRQF_ONESHOT) + desc->status |= IRQ_ONESHOT; + if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; @@ -875,7 +945,14 @@ EXPORT_SYMBOL_GPL(remove_irq); */ void free_irq(unsigned int irq, void *dev_id) { + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc) + return; + + chip_bus_lock(irq, desc); kfree(__free_irq(irq, dev_id)); + chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(free_irq); @@ -884,6 +961,8 @@ EXPORT_SYMBOL(free_irq); * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts + * If NULL and thread_fn != NULL the default + * primary handler is installed * @thread_fn: Function called from the irq handler thread * If NULL, no irq thread is created * @irqflags: Interrupt type flags @@ -963,8 +1042,12 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (desc->status & IRQ_NOREQUEST) return -EINVAL; - if (!handler) - return -EINVAL; + + if (!handler) { + if (!thread_fn) + return -EINVAL; + handler = irq_default_primary_handler; + } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) @@ -976,7 +1059,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->dev_id = dev_id; + chip_bus_lock(irq, desc); retval = __setup_irq(irq, desc, action); + chip_bus_sync_unlock(irq, desc); + if (retval) kfree(action); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 638d8be..a0bb09e 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -15,10 +15,10 @@ /** * suspend_device_irqs - disable all currently enabled interrupt lines * - * During system-wide suspend or hibernation device interrupts need to be - * disabled at the chip level and this function is provided for this purpose. - * It disables all interrupt lines that are enabled at the moment and sets the - * IRQ_SUSPENDED flag for them. + * During system-wide suspend or hibernation device drivers need to be prevented + * from receiving interrupts and this function is provided for this purpose. + * It marks all interrupt lines in use, except for the timer ones, as disabled + * and sets the IRQ_SUSPENDED flag for each of them. */ void suspend_device_irqs(void) { diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 89c7117..090c376 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; - if (!desc->chip || !desc->chip->retrigger || - !desc->chip->retrigger(irq)) { + if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { #ifdef CONFIG_HARDIRQS_SW_RESEND /* Set it pending and activate the softirq: */ set_bit(irq, irqs_resend); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 4d56829..114e704 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -297,7 +297,6 @@ static int __init irqfixup_setup(char *str) __setup("irqfixup", irqfixup_setup); module_param(irqfixup, int, 0644); -MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode"); static int __init irqpoll_setup(char *str) { diff --git a/kernel/itimer.c b/kernel/itimer.c index 58762f7..b03451e 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -12,6 +12,7 @@ #include <linux/time.h> #include <linux/posix-timers.h> #include <linux/hrtimer.h> +#include <trace/events/timer.h> #include <asm/uaccess.h> @@ -41,10 +42,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) return ktime_to_timeval(rem); } +static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerval *const value) +{ + cputime_t cval, cinterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero)) { + struct task_cputime cputime; + cputime_t t; + + thread_group_cputimer(tsk, &cputime); + if (clock_id == CPUCLOCK_PROF) + t = cputime_add(cputime.utime, cputime.stime); + else + /* CPUCLOCK_VIRT */ + t = cputime.utime; + + if (cputime_le(cval, t)) + /* about to fire */ + cval = cputime_one_jiffy; + else + cval = cputime_sub(cval, t); + } + + spin_unlock_irq(&tsk->sighand->siglock); + + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); +} + int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; - cputime_t cinterval, cval; switch (which) { case ITIMER_REAL: @@ -55,44 +89,10 @@ int do_getitimer(int which, struct itimerval *value) spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime cputime; - cputime_t utime; - - thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - if (cputime_le(cval, utime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, utime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); break; case ITIMER_PROF: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime times; - cputime_t ptime; - - thread_group_cputimer(tsk, ×); - ptime = cputime_add(times.utime, times.stime); - if (cputime_le(cval, ptime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, ptime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_PROF, value); break; default: return(-EINVAL); @@ -123,11 +123,62 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); + trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); return HRTIMER_NORESTART; } +static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) +{ + struct timespec ts; + s64 cpu_ns; + + cputime_to_timespec(ct, &ts); + cpu_ns = timespec_to_ns(&ts); + + return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; +} + +static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + const struct itimerval *const value, + struct itimerval *const ovalue) +{ + cputime_t cval, nval, cinterval, ninterval; + s64 ns_ninterval, ns_nval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + nval = timeval_to_cputime(&value->it_value); + ns_nval = timeval_to_ns(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + ns_ninterval = timeval_to_ns(&value->it_interval); + + it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); + it->error = cputime_sub_ns(nval, ns_nval); + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero) || + !cputime_eq(nval, cputime_zero)) { + if (cputime_gt(nval, cputime_zero)) + nval = cputime_add(nval, cputime_one_jiffy); + set_process_cpu_timer(tsk, clock_id, &nval, &cval); + } + it->expires = nval; + it->incr = ninterval; + trace_itimer_state(clock_id == CPUCLOCK_VIRT ? + ITIMER_VIRTUAL : ITIMER_PROF, value, nval); + + spin_unlock_irq(&tsk->sighand->siglock); + + if (ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } +} + /* * Returns true if the timeval is in canonical form */ @@ -139,7 +190,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) struct task_struct *tsk = current; struct hrtimer *timer; ktime_t expires; - cputime_t cval, cinterval, nval, ninterval; /* * Validate the timevals in value. @@ -171,51 +221,14 @@ again: } else tsk->signal->it_real_incr.tv64 = 0; + trace_itimer_state(ITIMER_REAL, value, 0); spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_VIRT, - &nval, &cval); - } - tsk->signal->it_virt_expires = nval; - tsk->signal->it_virt_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); break; case ITIMER_PROF: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_PROF, - &nval, &cval); - } - tsk->signal->it_prof_expires = nval; - tsk->signal->it_prof_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); break; default: return -EINVAL; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3a29dbe..8b6b8b6 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -59,7 +59,8 @@ static inline int is_kernel_inittext(unsigned long addr) static inline int is_kernel_text(unsigned long addr) { - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) + if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || + arch_is_kernel_text(addr)) return 1; return in_gate_area_no_task(addr); } diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 26539e3..3765ff3 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free); * writer, you don't need extra locking to use these functions. */ unsigned int __kfifo_put(struct kfifo *fifo, - unsigned char *buffer, unsigned int len) + const unsigned char *buffer, unsigned int len) { unsigned int l; diff --git a/kernel/kmod.c b/kernel/kmod.c index a922808..689d20f 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -80,6 +80,10 @@ int __request_module(bool wait, const char *fmt, ...) #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; + ret = security_kernel_module_request(); + if (ret) + return ret; + va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); @@ -139,6 +143,7 @@ struct subprocess_info { static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; + enum umh_wait wait = sub_info->wait; int retval; BUG_ON(atomic_read(&sub_info->cred->usage) != 1); @@ -180,10 +185,14 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); + if (wait == UMH_WAIT_EXEC) + complete(sub_info->complete); + retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); /* Exec failed? */ - sub_info->retval = retval; + if (wait != UMH_WAIT_EXEC) + sub_info->retval = retval; do_exit(0); } @@ -262,16 +271,14 @@ static void __call_usermodehelper(struct work_struct *work) switch (wait) { case UMH_NO_WAIT: + case UMH_WAIT_EXEC: break; case UMH_WAIT_PROC: if (pid > 0) break; sub_info->retval = pid; - /* FALLTHROUGH */ - - case UMH_WAIT_EXEC: - complete(sub_info->complete); + break; } } @@ -466,6 +473,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int retval = 0; BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + validate_creds(sub_info->cred); helper_lock(); if (sub_info->path[0] == '\0') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b946761..b466afa 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1349,7 +1349,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) return 0; } -static struct seq_operations kprobes_seq_ops = { +static const struct seq_operations kprobes_seq_ops = { .start = kprobe_seq_start, .next = kprobe_seq_next, .stop = kprobe_seq_stop, diff --git a/kernel/kthread.c b/kernel/kthread.c index eb8751a..5fe7099 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -16,8 +16,6 @@ #include <linux/mutex.h> #include <trace/events/sched.h> -#define KTHREAD_NICE_LEVEL (-5) - static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; @@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), * The kernel thread should not inherit these properties. */ sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m); - set_user_nice(create.result, KTHREAD_NICE_LEVEL); set_cpus_allowed_ptr(create.result, cpu_all_mask); } return create.result; @@ -221,7 +218,6 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_user_nice(tsk, KTHREAD_NICE_LEVEL); set_cpus_allowed_ptr(tsk, cpu_all_mask); set_mems_allowed(node_possible_map); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 8bbeef9..3815ac1d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,6 +42,7 @@ #include <linux/hash.h> #include <linux/ftrace.h> #include <linux/stringify.h> +#include <linux/bitops.h> #include <asm/sections.h> @@ -366,11 +367,21 @@ static int save_trace(struct stack_trace *trace) save_stack_trace(trace); + /* + * Some daft arches put -1 at the end to indicate its a full trace. + * + * <rant> this is buggy anyway, since it takes a whole extra entry so a + * complete trace that maxes out the entries provided will be reported + * as incomplete, friggin useless </rant> + */ + if (trace->entries[trace->nr_entries-1] == ULONG_MAX) + trace->nr_entries--; + trace->max_entries = trace->nr_entries; nr_stack_trace_entries += trace->nr_entries; - if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { if (!debug_locks_off_graph_unlock()) return 0; @@ -388,20 +399,6 @@ unsigned int nr_hardirq_chains; unsigned int nr_softirq_chains; unsigned int nr_process_chains; unsigned int max_lockdep_depth; -unsigned int max_recursion_depth; - -static unsigned int lockdep_dependency_gen_id; - -static bool lockdep_dependency_visit(struct lock_class *source, - unsigned int depth) -{ - if (!depth) - lockdep_dependency_gen_id++; - if (source->dep_gen_id == lockdep_dependency_gen_id) - return true; - source->dep_gen_id = lockdep_dependency_gen_id; - return false; -} #ifdef CONFIG_DEBUG_LOCKDEP /* @@ -431,11 +428,8 @@ atomic_t redundant_softirqs_on; atomic_t redundant_softirqs_off; atomic_t nr_unused_locks; atomic_t nr_cyclic_checks; -atomic_t nr_cyclic_check_recursions; atomic_t nr_find_usage_forwards_checks; -atomic_t nr_find_usage_forwards_recursions; atomic_t nr_find_usage_backwards_checks; -atomic_t nr_find_usage_backwards_recursions; #endif /* @@ -551,58 +545,6 @@ static void lockdep_print_held_locks(struct task_struct *curr) } } -static void print_lock_class_header(struct lock_class *class, int depth) -{ - int bit; - - printk("%*s->", depth, ""); - print_lock_name(class); - printk(" ops: %lu", class->ops); - printk(" {\n"); - - for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { - if (class->usage_mask & (1 << bit)) { - int len = depth; - - len += printk("%*s %s", depth, "", usage_str[bit]); - len += printk(" at:\n"); - print_stack_trace(class->usage_traces + bit, len); - } - } - printk("%*s }\n", depth, ""); - - printk("%*s ... key at: ",depth,""); - print_ip_sym((unsigned long)class->key); -} - -/* - * printk all lock dependencies starting at <entry>: - */ -static void __used -print_lock_dependencies(struct lock_class *class, int depth) -{ - struct lock_list *entry; - - if (lockdep_dependency_visit(class, depth)) - return; - - if (DEBUG_LOCKS_WARN_ON(depth >= 20)) - return; - - print_lock_class_header(class, depth); - - list_for_each_entry(entry, &class->locks_after, entry) { - if (DEBUG_LOCKS_WARN_ON(!entry->class)) - return; - - print_lock_dependencies(entry->class, depth + 1); - - printk("%*s ... acquired at:\n",depth,""); - print_stack_trace(&entry->trace, 2); - printk("\n"); - } -} - static void print_kernel_version(void) { printk("%s %.*s\n", init_utsname()->release, @@ -636,6 +578,9 @@ static int static_obj(void *obj) if ((addr >= start) && (addr < end)) return 1; + if (arch_is_kernel_data(addr)) + return 1; + #ifdef CONFIG_SMP /* * percpu var? @@ -898,22 +843,203 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, } /* + * For good efficiency of modular, we use power of 2 + */ +#define MAX_CIRCULAR_QUEUE_SIZE 4096UL +#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) + +/* + * The circular_queue and helpers is used to implement the + * breadth-first search(BFS)algorithem, by which we can build + * the shortest path from the next lock to be acquired to the + * previous held lock if there is a circular between them. + */ +struct circular_queue { + unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; + unsigned int front, rear; +}; + +static struct circular_queue lock_cq; + +unsigned int max_bfs_queue_depth; + +static unsigned int lockdep_dependency_gen_id; + +static inline void __cq_init(struct circular_queue *cq) +{ + cq->front = cq->rear = 0; + lockdep_dependency_gen_id++; +} + +static inline int __cq_empty(struct circular_queue *cq) +{ + return (cq->front == cq->rear); +} + +static inline int __cq_full(struct circular_queue *cq) +{ + return ((cq->rear + 1) & CQ_MASK) == cq->front; +} + +static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) +{ + if (__cq_full(cq)) + return -1; + + cq->element[cq->rear] = elem; + cq->rear = (cq->rear + 1) & CQ_MASK; + return 0; +} + +static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) +{ + if (__cq_empty(cq)) + return -1; + + *elem = cq->element[cq->front]; + cq->front = (cq->front + 1) & CQ_MASK; + return 0; +} + +static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) +{ + return (cq->rear - cq->front) & CQ_MASK; +} + +static inline void mark_lock_accessed(struct lock_list *lock, + struct lock_list *parent) +{ + unsigned long nr; + + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); + lock->parent = parent; + lock->class->dep_gen_id = lockdep_dependency_gen_id; +} + +static inline unsigned long lock_accessed(struct lock_list *lock) +{ + unsigned long nr; + + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); + return lock->class->dep_gen_id == lockdep_dependency_gen_id; +} + +static inline struct lock_list *get_lock_parent(struct lock_list *child) +{ + return child->parent; +} + +static inline int get_lock_depth(struct lock_list *child) +{ + int depth = 0; + struct lock_list *parent; + + while ((parent = get_lock_parent(child))) { + child = parent; + depth++; + } + return depth; +} + +static int __bfs(struct lock_list *source_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry, + int forward) +{ + struct lock_list *entry; + struct list_head *head; + struct circular_queue *cq = &lock_cq; + int ret = 1; + + if (match(source_entry, data)) { + *target_entry = source_entry; + ret = 0; + goto exit; + } + + if (forward) + head = &source_entry->class->locks_after; + else + head = &source_entry->class->locks_before; + + if (list_empty(head)) + goto exit; + + __cq_init(cq); + __cq_enqueue(cq, (unsigned long)source_entry); + + while (!__cq_empty(cq)) { + struct lock_list *lock; + + __cq_dequeue(cq, (unsigned long *)&lock); + + if (!lock->class) { + ret = -2; + goto exit; + } + + if (forward) + head = &lock->class->locks_after; + else + head = &lock->class->locks_before; + + list_for_each_entry(entry, head, entry) { + if (!lock_accessed(entry)) { + unsigned int cq_depth; + mark_lock_accessed(entry, lock); + if (match(entry, data)) { + *target_entry = entry; + ret = 0; + goto exit; + } + + if (__cq_enqueue(cq, (unsigned long)entry)) { + ret = -1; + goto exit; + } + cq_depth = __cq_get_elem_count(cq); + if (max_bfs_queue_depth < cq_depth) + max_bfs_queue_depth = cq_depth; + } + } + } +exit: + return ret; +} + +static inline int __bfs_forwards(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) +{ + return __bfs(src_entry, data, match, target_entry, 1); + +} + +static inline int __bfs_backwards(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) +{ + return __bfs(src_entry, data, match, target_entry, 0); + +} + +/* * Recursive, forwards-direction lock-dependency checking, used for * both noncyclic checking and for hardirq-unsafe/softirq-unsafe * checking. - * - * (to keep the stackframe of the recursive functions small we - * use these global variables, and we also mark various helper - * functions as noinline.) */ -static struct held_lock *check_source, *check_target; /* * Print a dependency chain entry (this is only done when a deadlock * has been detected): */ static noinline int -print_circular_bug_entry(struct lock_list *target, unsigned int depth) +print_circular_bug_entry(struct lock_list *target, int depth) { if (debug_locks_silent) return 0; @@ -930,11 +1056,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) * header first: */ static noinline int -print_circular_bug_header(struct lock_list *entry, unsigned int depth) +print_circular_bug_header(struct lock_list *entry, unsigned int depth, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; - if (!debug_locks_off_graph_unlock() || debug_locks_silent) + if (debug_locks_silent) return 0; printk("\n=======================================================\n"); @@ -943,9 +1071,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) printk( "-------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); - print_lock(check_source); + print_lock(check_src); printk("\nbut task is already holding lock:\n"); - print_lock(check_target); + print_lock(check_tgt); printk("\nwhich lock already depends on the new lock.\n\n"); printk("\nthe existing dependency chain (in reverse order) is:\n"); @@ -954,19 +1082,36 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) return 0; } -static noinline int print_circular_bug_tail(void) +static inline int class_equal(struct lock_list *entry, void *data) +{ + return entry->class == data; +} + +static noinline int print_circular_bug(struct lock_list *this, + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; - struct lock_list this; + struct lock_list *parent; + int depth; - if (debug_locks_silent) + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - this.class = hlock_class(check_source); - if (!save_trace(&this.trace)) + if (!save_trace(&this->trace)) return 0; - print_circular_bug_entry(&this, 0); + depth = get_lock_depth(target); + + print_circular_bug_header(target, depth, check_src, check_tgt); + + parent = get_lock_parent(target); + + while (parent) { + print_circular_bug_entry(parent, --depth); + parent = get_lock_parent(parent); + } printk("\nother info that might help us debug this:\n\n"); lockdep_print_held_locks(curr); @@ -977,73 +1122,69 @@ static noinline int print_circular_bug_tail(void) return 0; } -#define RECURSION_LIMIT 40 - -static int noinline print_infinite_recursion_bug(void) +static noinline int print_bfs_bug(int ret) { if (!debug_locks_off_graph_unlock()) return 0; - WARN_ON(1); + WARN(1, "lockdep bfs error:%d\n", ret); return 0; } -unsigned long __lockdep_count_forward_deps(struct lock_class *class, - unsigned int depth) +static int noop_count(struct lock_list *entry, void *data) { - struct lock_list *entry; - unsigned long ret = 1; + (*(unsigned long *)data)++; + return 0; +} - if (lockdep_dependency_visit(class, depth)) - return 0; +unsigned long __lockdep_count_forward_deps(struct lock_list *this) +{ + unsigned long count = 0; + struct lock_list *uninitialized_var(target_entry); - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_after, entry) - ret += __lockdep_count_forward_deps(entry->class, depth + 1); + __bfs_forwards(this, (void *)&count, noop_count, &target_entry); - return ret; + return count; } - unsigned long lockdep_count_forward_deps(struct lock_class *class) { unsigned long ret, flags; + struct lock_list this; + + this.parent = NULL; + this.class = class; local_irq_save(flags); __raw_spin_lock(&lockdep_lock); - ret = __lockdep_count_forward_deps(class, 0); + ret = __lockdep_count_forward_deps(&this); __raw_spin_unlock(&lockdep_lock); local_irq_restore(flags); return ret; } -unsigned long __lockdep_count_backward_deps(struct lock_class *class, - unsigned int depth) +unsigned long __lockdep_count_backward_deps(struct lock_list *this) { - struct lock_list *entry; - unsigned long ret = 1; + unsigned long count = 0; + struct lock_list *uninitialized_var(target_entry); - if (lockdep_dependency_visit(class, depth)) - return 0; - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_before, entry) - ret += __lockdep_count_backward_deps(entry->class, depth + 1); + __bfs_backwards(this, (void *)&count, noop_count, &target_entry); - return ret; + return count; } unsigned long lockdep_count_backward_deps(struct lock_class *class) { unsigned long ret, flags; + struct lock_list this; + + this.parent = NULL; + this.class = class; local_irq_save(flags); __raw_spin_lock(&lockdep_lock); - ret = __lockdep_count_backward_deps(class, 0); + ret = __lockdep_count_backward_deps(&this); __raw_spin_unlock(&lockdep_lock); local_irq_restore(flags); @@ -1055,29 +1196,16 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) * lead to <target>. Print an error and return 0 if it does. */ static noinline int -check_noncircular(struct lock_class *source, unsigned int depth) +check_noncircular(struct lock_list *root, struct lock_class *target, + struct lock_list **target_entry) { - struct lock_list *entry; + int result; - if (lockdep_dependency_visit(source, depth)) - return 1; + debug_atomic_inc(&nr_cyclic_checks); - debug_atomic_inc(&nr_cyclic_check_recursions); - if (depth > max_recursion_depth) - max_recursion_depth = depth; - if (depth >= RECURSION_LIMIT) - return print_infinite_recursion_bug(); - /* - * Check this lock's dependency list: - */ - list_for_each_entry(entry, &source->locks_after, entry) { - if (entry->class == hlock_class(check_target)) - return print_circular_bug_header(entry, depth+1); - debug_atomic_inc(&nr_cyclic_checks); - if (!check_noncircular(entry->class, depth+1)) - return print_circular_bug_entry(entry, depth+1); - } - return 1; + result = __bfs_forwards(root, target, class_equal, target_entry); + + return result; } #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) @@ -1086,103 +1214,121 @@ check_noncircular(struct lock_class *source, unsigned int depth) * proving that two subgraphs can be connected by a new dependency * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ -static enum lock_usage_bit find_usage_bit; -static struct lock_class *forwards_match, *backwards_match; + +static inline int usage_match(struct lock_list *entry, void *bit) +{ + return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); +} + + /* * Find a node in the forwards-direction dependency sub-graph starting - * at <source> that matches <find_usage_bit>. + * at @root->class that matches @bit. * - * Return 2 if such a node exists in the subgraph, and put that node - * into <forwards_match>. + * Return 0 if such a node exists in the subgraph, and put that node + * into *@target_entry. * - * Return 1 otherwise and keep <forwards_match> unchanged. - * Return 0 on error. + * Return 1 otherwise and keep *@target_entry unchanged. + * Return <0 on error. */ -static noinline int -find_usage_forwards(struct lock_class *source, unsigned int depth) +static int +find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, + struct lock_list **target_entry) { - struct lock_list *entry; - int ret; - - if (lockdep_dependency_visit(source, depth)) - return 1; - - if (depth > max_recursion_depth) - max_recursion_depth = depth; - if (depth >= RECURSION_LIMIT) - return print_infinite_recursion_bug(); + int result; debug_atomic_inc(&nr_find_usage_forwards_checks); - if (source->usage_mask & (1 << find_usage_bit)) { - forwards_match = source; - return 2; - } - /* - * Check this lock's dependency list: - */ - list_for_each_entry(entry, &source->locks_after, entry) { - debug_atomic_inc(&nr_find_usage_forwards_recursions); - ret = find_usage_forwards(entry->class, depth+1); - if (ret == 2 || ret == 0) - return ret; - } - return 1; + result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); + + return result; } /* * Find a node in the backwards-direction dependency sub-graph starting - * at <source> that matches <find_usage_bit>. + * at @root->class that matches @bit. * - * Return 2 if such a node exists in the subgraph, and put that node - * into <backwards_match>. + * Return 0 if such a node exists in the subgraph, and put that node + * into *@target_entry. * - * Return 1 otherwise and keep <backwards_match> unchanged. - * Return 0 on error. + * Return 1 otherwise and keep *@target_entry unchanged. + * Return <0 on error. */ -static noinline int -find_usage_backwards(struct lock_class *source, unsigned int depth) +static int +find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, + struct lock_list **target_entry) { - struct lock_list *entry; - int ret; + int result; - if (lockdep_dependency_visit(source, depth)) - return 1; + debug_atomic_inc(&nr_find_usage_backwards_checks); - if (!__raw_spin_is_locked(&lockdep_lock)) - return DEBUG_LOCKS_WARN_ON(1); + result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); - if (depth > max_recursion_depth) - max_recursion_depth = depth; - if (depth >= RECURSION_LIMIT) - return print_infinite_recursion_bug(); + return result; +} - debug_atomic_inc(&nr_find_usage_backwards_checks); - if (source->usage_mask & (1 << find_usage_bit)) { - backwards_match = source; - return 2; - } +static void print_lock_class_header(struct lock_class *class, int depth) +{ + int bit; - if (!source && debug_locks_off_graph_unlock()) { - WARN_ON(1); - return 0; - } + printk("%*s->", depth, ""); + print_lock_name(class); + printk(" ops: %lu", class->ops); + printk(" {\n"); - /* - * Check this lock's dependency list: - */ - list_for_each_entry(entry, &source->locks_before, entry) { - debug_atomic_inc(&nr_find_usage_backwards_recursions); - ret = find_usage_backwards(entry->class, depth+1); - if (ret == 2 || ret == 0) - return ret; + for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { + if (class->usage_mask & (1 << bit)) { + int len = depth; + + len += printk("%*s %s", depth, "", usage_str[bit]); + len += printk(" at:\n"); + print_stack_trace(class->usage_traces + bit, len); + } } - return 1; + printk("%*s }\n", depth, ""); + + printk("%*s ... key at: ",depth,""); + print_ip_sym((unsigned long)class->key); +} + +/* + * printk the shortest lock dependencies from @start to @end in reverse order: + */ +static void __used +print_shortest_lock_dependencies(struct lock_list *leaf, + struct lock_list *root) +{ + struct lock_list *entry = leaf; + int depth; + + /*compute depth from generated tree by BFS*/ + depth = get_lock_depth(leaf); + + do { + print_lock_class_header(entry->class, depth); + printk("%*s ... acquired at:\n", depth, ""); + print_stack_trace(&entry->trace, 2); + printk("\n"); + + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad BFS generated tree\n", __func__); + break; + } + + entry = get_lock_parent(entry); + depth--; + } while (entry && (depth >= 0)); + + return; } static int print_bad_irq_dependency(struct task_struct *curr, + struct lock_list *prev_root, + struct lock_list *next_root, + struct lock_list *backwards_entry, + struct lock_list *forwards_entry, struct held_lock *prev, struct held_lock *next, enum lock_usage_bit bit1, @@ -1215,26 +1361,32 @@ print_bad_irq_dependency(struct task_struct *curr, printk("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); - print_lock_name(backwards_match); + print_lock_name(backwards_entry->class); printk("\n... which became %s-irq-safe at:\n", irqclass); - print_stack_trace(backwards_match->usage_traces + bit1, 1); + print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); printk("\nto a %s-irq-unsafe lock:\n", irqclass); - print_lock_name(forwards_match); + print_lock_name(forwards_entry->class); printk("\n... which became %s-irq-unsafe at:\n", irqclass); printk("..."); - print_stack_trace(forwards_match->usage_traces + bit2, 1); + print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); printk("\nother info that might help us debug this:\n\n"); lockdep_print_held_locks(curr); - printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); - print_lock_dependencies(backwards_match, 0); + printk("\nthe dependencies between %s-irq-safe lock", irqclass); + printk(" and the holding lock:\n"); + if (!save_trace(&prev_root->trace)) + return 0; + print_shortest_lock_dependencies(backwards_entry, prev_root); - printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); - print_lock_dependencies(forwards_match, 0); + printk("\nthe dependencies between the lock to be acquired"); + printk(" and %s-irq-unsafe lock:\n", irqclass); + if (!save_trace(&next_root->trace)) + return 0; + print_shortest_lock_dependencies(forwards_entry, next_root); printk("\nstack backtrace:\n"); dump_stack(); @@ -1248,19 +1400,30 @@ check_usage(struct task_struct *curr, struct held_lock *prev, enum lock_usage_bit bit_forwards, const char *irqclass) { int ret; + struct lock_list this, that; + struct lock_list *uninitialized_var(target_entry); + struct lock_list *uninitialized_var(target_entry1); + + this.parent = NULL; - find_usage_bit = bit_backwards; - /* fills in <backwards_match> */ - ret = find_usage_backwards(hlock_class(prev), 0); - if (!ret || ret == 1) + this.class = hlock_class(prev); + ret = find_usage_backwards(&this, bit_backwards, &target_entry); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) return ret; - find_usage_bit = bit_forwards; - ret = find_usage_forwards(hlock_class(next), 0); - if (!ret || ret == 1) + that.parent = NULL; + that.class = hlock_class(next); + ret = find_usage_forwards(&that, bit_forwards, &target_entry1); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) return ret; - /* ret == 2 */ - return print_bad_irq_dependency(curr, prev, next, + + return print_bad_irq_dependency(curr, &this, &that, + target_entry, target_entry1, + prev, next, bit_backwards, bit_forwards, irqclass); } @@ -1472,6 +1635,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, { struct lock_list *entry; int ret; + struct lock_list this; + struct lock_list *uninitialized_var(target_entry); /* * Prove that the new <prev> -> <next> dependency would not @@ -1482,10 +1647,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * We are using global variables to control the recursion, to * keep the stackframe size of the recursive functions low: */ - check_source = next; - check_target = prev; - if (!(check_noncircular(hlock_class(next), 0))) - return print_circular_bug_tail(); + this.class = hlock_class(next); + this.parent = NULL; + ret = check_noncircular(&this, hlock_class(prev), &target_entry); + if (unlikely(!ret)) + return print_circular_bug(&this, target_entry, next, prev); + else if (unlikely(ret < 0)) + return print_bfs_bug(ret); if (!check_prev_add_irq(curr, prev, next)) return 0; @@ -1884,7 +2052,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, * print irq inversion bug: */ static int -print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, +print_irq_inversion_bug(struct task_struct *curr, + struct lock_list *root, struct lock_list *other, struct held_lock *this, int forwards, const char *irqclass) { @@ -1902,17 +2071,16 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); - print_lock_name(other); + print_lock_name(other->class); printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); printk("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nthe first lock's dependencies:\n"); - print_lock_dependencies(hlock_class(this), 0); - - printk("\nthe second lock's dependencies:\n"); - print_lock_dependencies(other, 0); + printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); + if (!save_trace(&root->trace)) + return 0; + print_shortest_lock_dependencies(other, root); printk("\nstack backtrace:\n"); dump_stack(); @@ -1929,14 +2097,19 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit bit, const char *irqclass) { int ret; - - find_usage_bit = bit; - /* fills in <forwards_match> */ - ret = find_usage_forwards(hlock_class(this), 0); - if (!ret || ret == 1) + struct lock_list root; + struct lock_list *uninitialized_var(target_entry); + + root.parent = NULL; + root.class = hlock_class(this); + ret = find_usage_forwards(&root, bit, &target_entry); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) return ret; - return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); + return print_irq_inversion_bug(curr, &root, target_entry, + this, 1, irqclass); } /* @@ -1948,14 +2121,19 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit bit, const char *irqclass) { int ret; - - find_usage_bit = bit; - /* fills in <backwards_match> */ - ret = find_usage_backwards(hlock_class(this), 0); - if (!ret || ret == 1) + struct lock_list root; + struct lock_list *uninitialized_var(target_entry); + + root.parent = NULL; + root.class = hlock_class(this); + ret = find_usage_backwards(&root, bit, &target_entry); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) return ret; - return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); + return print_irq_inversion_bug(curr, &root, target_entry, + this, 1, irqclass); } void print_irqtrace_events(struct task_struct *curr) @@ -2530,13 +2708,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); */ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, - struct lockdep_map *nest_lock, unsigned long ip) + struct lockdep_map *nest_lock, unsigned long ip, + int references) { struct task_struct *curr = current; struct lock_class *class = NULL; struct held_lock *hlock; unsigned int depth, id; int chain_head = 0; + int class_idx; u64 chain_key; if (!prove_locking) @@ -2584,10 +2764,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) return 0; + class_idx = class - lock_classes + 1; + + if (depth) { + hlock = curr->held_locks + depth - 1; + if (hlock->class_idx == class_idx && nest_lock) { + if (hlock->references) + hlock->references++; + else + hlock->references = 2; + + return 1; + } + } + hlock = curr->held_locks + depth; if (DEBUG_LOCKS_WARN_ON(!class)) return 0; - hlock->class_idx = class - lock_classes + 1; + hlock->class_idx = class_idx; hlock->acquire_ip = ip; hlock->instance = lock; hlock->nest_lock = nest_lock; @@ -2595,6 +2789,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->read = read; hlock->check = check; hlock->hardirqs_off = !!hardirqs_off; + hlock->references = references; #ifdef CONFIG_LOCK_STAT hlock->waittime_stamp = 0; hlock->holdtime_stamp = sched_clock(); @@ -2703,6 +2898,30 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, return 1; } +static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) +{ + if (hlock->instance == lock) + return 1; + + if (hlock->references) { + struct lock_class *class = lock->class_cache; + + if (!class) + class = look_up_lock_class(lock, 0); + + if (DEBUG_LOCKS_WARN_ON(!class)) + return 0; + + if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) + return 0; + + if (hlock->class_idx == class - lock_classes + 1) + return 1; + } + + return 0; +} + static int __lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, @@ -2726,7 +2945,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name, */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; - if (hlock->instance == lock) + if (match_held_lock(hlock, lock)) goto found_it; prev_hlock = hlock; } @@ -2745,7 +2964,8 @@ found_it: if (!__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip)) + hlock->nest_lock, hlock->acquire_ip, + hlock->references)) return 0; } @@ -2784,20 +3004,34 @@ lock_release_non_nested(struct task_struct *curr, */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; - if (hlock->instance == lock) + if (match_held_lock(hlock, lock)) goto found_it; prev_hlock = hlock; } return print_unlock_inbalance_bug(curr, lock, ip); found_it: - lock_release_holdtime(hlock); + if (hlock->instance == lock) + lock_release_holdtime(hlock); + + if (hlock->references) { + hlock->references--; + if (hlock->references) { + /* + * We had, and after removing one, still have + * references, the current lock stack is still + * valid. We're done! + */ + return 1; + } + } /* * We have the right lock to unlock, 'hlock' points to it. * Now we remove it from the stack, and add back the other * entries (if any), recalculating the hash along the way: */ + curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; @@ -2806,7 +3040,8 @@ found_it: if (!__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip)) + hlock->nest_lock, hlock->acquire_ip, + hlock->references)) return 0; } @@ -2836,7 +3071,7 @@ static int lock_release_nested(struct task_struct *curr, /* * Is the unlock non-nested: */ - if (hlock->instance != lock) + if (hlock->instance != lock || hlock->references) return lock_release_non_nested(curr, lock, ip); curr->lockdep_depth--; @@ -2881,6 +3116,21 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) check_chain_key(curr); } +static int __lock_is_held(struct lockdep_map *lock) +{ + struct task_struct *curr = current; + int i; + + for (i = 0; i < curr->lockdep_depth; i++) { + struct held_lock *hlock = curr->held_locks + i; + + if (match_held_lock(hlock, lock)) + return 1; + } + + return 0; +} + /* * Check whether we follow the irq-flags state precisely: */ @@ -2957,7 +3207,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, current->lockdep_recursion = 1; __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip); + irqs_disabled_flags(flags), nest_lock, ip, 0); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } @@ -2982,6 +3232,26 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); +int lock_is_held(struct lockdep_map *lock) +{ + unsigned long flags; + int ret = 0; + + if (unlikely(current->lockdep_recursion)) + return ret; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + ret = __lock_is_held(lock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(lock_is_held); + void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { current->lockdep_reclaim_gfp = gfp_mask; @@ -3041,7 +3311,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; - if (hlock->instance == lock) + if (match_held_lock(hlock, lock)) goto found_it; prev_hlock = hlock; } @@ -3049,6 +3319,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) return; found_it: + if (hlock->instance != lock) + return; + hlock->waittime_stamp = sched_clock(); contention_point = lock_point(hlock_class(hlock)->contention_point, ip); @@ -3088,7 +3361,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; - if (hlock->instance == lock) + if (match_held_lock(hlock, lock)) goto found_it; prev_hlock = hlock; } @@ -3096,6 +3369,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) return; found_it: + if (hlock->instance != lock) + return; + cpu = smp_processor_id(); if (hlock->waittime_stamp) { now = sched_clock(); @@ -3326,7 +3602,12 @@ void __init lockdep_info(void) sizeof(struct list_head) * CLASSHASH_SIZE + sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + - sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); + sizeof(struct list_head) * CHAINHASH_SIZE +#ifdef CONFIG_PROVE_LOCKING + + sizeof(struct circular_queue) +#endif + ) / 1024 + ); printk(" per task-struct memory footprint: %lu bytes\n", sizeof(struct held_lock) * MAX_LOCK_DEPTH); diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 699a2ac..a2ee95a 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -91,6 +91,8 @@ extern unsigned int nr_process_chains; extern unsigned int max_lockdep_depth; extern unsigned int max_recursion_depth; +extern unsigned int max_bfs_queue_depth; + #ifdef CONFIG_PROVE_LOCKING extern unsigned long lockdep_count_forward_deps(struct lock_class *); extern unsigned long lockdep_count_backward_deps(struct lock_class *); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index e94caa6..d4aba4f 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -25,38 +25,12 @@ static void *l_next(struct seq_file *m, void *v, loff_t *pos) { - struct lock_class *class; - - (*pos)++; - - if (v == SEQ_START_TOKEN) - class = m->private; - else { - class = v; - - if (class->lock_entry.next != &all_lock_classes) - class = list_entry(class->lock_entry.next, - struct lock_class, lock_entry); - else - class = NULL; - } - - return class; + return seq_list_next(v, &all_lock_classes, pos); } static void *l_start(struct seq_file *m, loff_t *pos) { - struct lock_class *class; - loff_t i = 0; - - if (*pos == 0) - return SEQ_START_TOKEN; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - if (++i == *pos) - return class; - } - return NULL; + return seq_list_start_head(&all_lock_classes, *pos); } static void l_stop(struct seq_file *m, void *v) @@ -82,11 +56,11 @@ static void print_name(struct seq_file *m, struct lock_class *class) static int l_show(struct seq_file *m, void *v) { - struct lock_class *class = v; + struct lock_class *class = list_entry(v, struct lock_class, lock_entry); struct lock_list *entry; char usage[LOCK_USAGE_CHARS]; - if (v == SEQ_START_TOKEN) { + if (v == &all_lock_classes) { seq_printf(m, "all lock classes:\n"); return 0; } @@ -128,17 +102,7 @@ static const struct seq_operations lockdep_ops = { static int lockdep_open(struct inode *inode, struct file *file) { - int res = seq_open(file, &lockdep_ops); - if (!res) { - struct seq_file *m = file->private_data; - - if (!list_empty(&all_lock_classes)) - m->private = list_entry(all_lock_classes.next, - struct lock_class, lock_entry); - else - m->private = NULL; - } - return res; + return seq_open(file, &lockdep_ops); } static const struct file_operations proc_lockdep_operations = { @@ -149,37 +113,23 @@ static const struct file_operations proc_lockdep_operations = { }; #ifdef CONFIG_PROVE_LOCKING -static void *lc_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct lock_chain *chain; - - (*pos)++; - - if (v == SEQ_START_TOKEN) - chain = m->private; - else { - chain = v; - - if (*pos < nr_lock_chains) - chain = lock_chains + *pos; - else - chain = NULL; - } - - return chain; -} - static void *lc_start(struct seq_file *m, loff_t *pos) { if (*pos == 0) return SEQ_START_TOKEN; - if (*pos < nr_lock_chains) - return lock_chains + *pos; + if (*pos - 1 < nr_lock_chains) + return lock_chains + (*pos - 1); return NULL; } +static void *lc_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return lc_start(m, pos); +} + static void lc_stop(struct seq_file *m, void *v) { } @@ -220,16 +170,7 @@ static const struct seq_operations lockdep_chains_ops = { static int lockdep_chains_open(struct inode *inode, struct file *file) { - int res = seq_open(file, &lockdep_chains_ops); - if (!res) { - struct seq_file *m = file->private_data; - - if (nr_lock_chains) - m->private = lock_chains; - else - m->private = NULL; - } - return res; + return seq_open(file, &lockdep_chains_ops); } static const struct file_operations proc_lockdep_chains_operations = { @@ -258,16 +199,10 @@ static void lockdep_stats_debug_show(struct seq_file *m) debug_atomic_read(&chain_lookup_hits)); seq_printf(m, " cyclic checks: %11u\n", debug_atomic_read(&nr_cyclic_checks)); - seq_printf(m, " cyclic-check recursions: %11u\n", - debug_atomic_read(&nr_cyclic_check_recursions)); seq_printf(m, " find-mask forwards checks: %11u\n", debug_atomic_read(&nr_find_usage_forwards_checks)); - seq_printf(m, " find-mask forwards recursions: %11u\n", - debug_atomic_read(&nr_find_usage_forwards_recursions)); seq_printf(m, " find-mask backwards checks: %11u\n", debug_atomic_read(&nr_find_usage_backwards_checks)); - seq_printf(m, " find-mask backwards recursions:%11u\n", - debug_atomic_read(&nr_find_usage_backwards_recursions)); seq_printf(m, " hardirq on events: %11u\n", hi1); seq_printf(m, " hardirq off events: %11u\n", hi2); @@ -409,8 +344,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_unused); seq_printf(m, " max locking depth: %11u\n", max_lockdep_depth); - seq_printf(m, " max recursion depth: %11u\n", - max_recursion_depth); +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " max bfs queue depth: %11u\n", + max_bfs_queue_depth); +#endif lockdep_stats_debug_show(m); seq_printf(m, " debug_locks: %11u\n", debug_locks); @@ -438,7 +375,6 @@ struct lock_stat_data { }; struct lock_stat_seq { - struct lock_stat_data *iter; struct lock_stat_data *iter_end; struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; }; @@ -626,34 +562,22 @@ static void seq_header(struct seq_file *m) static void *ls_start(struct seq_file *m, loff_t *pos) { struct lock_stat_seq *data = m->private; + struct lock_stat_data *iter; if (*pos == 0) return SEQ_START_TOKEN; - data->iter = data->stats + *pos; - if (data->iter >= data->iter_end) - data->iter = NULL; + iter = data->stats + (*pos - 1); + if (iter >= data->iter_end) + iter = NULL; - return data->iter; + return iter; } static void *ls_next(struct seq_file *m, void *v, loff_t *pos) { - struct lock_stat_seq *data = m->private; - (*pos)++; - - if (v == SEQ_START_TOKEN) - data->iter = data->stats; - else { - data->iter = v; - data->iter++; - } - - if (data->iter == data->iter_end) - data->iter = NULL; - - return data->iter; + return ls_start(m, pos); } static void ls_stop(struct seq_file *m, void *v) @@ -670,7 +594,7 @@ static int ls_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations lockstat_ops = { +static const struct seq_operations lockstat_ops = { .start = ls_start, .next = ls_next, .stop = ls_stop, @@ -691,7 +615,6 @@ static int lock_stat_open(struct inode *inode, struct file *file) struct lock_stat_data *iter = data->stats; struct seq_file *m = file->private_data; - data->iter = iter; list_for_each_entry(class, &all_lock_classes, lock_entry) { iter->class = class; iter->stats = lock_stats(class); @@ -699,7 +622,7 @@ static int lock_stat_open(struct inode *inode, struct file *file) } data->iter_end = iter; - sort(data->stats, data->iter_end - data->iter, + sort(data->stats, data->iter_end - data->stats, sizeof(struct lock_stat_data), lock_stat_cmp, NULL); @@ -734,7 +657,6 @@ static int lock_stat_release(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; vfree(seq->private); - seq->private = NULL; return seq_release(inode, file); } diff --git a/kernel/marker.c b/kernel/marker.c deleted file mode 100644 index ea54f26..0000000 --- a/kernel/marker.c +++ /dev/null @@ -1,930 +0,0 @@ -/* - * Copyright (C) 2007 Mathieu Desnoyers - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/types.h> -#include <linux/jhash.h> -#include <linux/list.h> -#include <linux/rcupdate.h> -#include <linux/marker.h> -#include <linux/err.h> -#include <linux/slab.h> - -extern struct marker __start___markers[]; -extern struct marker __stop___markers[]; - -/* Set to 1 to enable marker debug output */ -static const int marker_debug; - -/* - * markers_mutex nests inside module_mutex. Markers mutex protects the builtin - * and module markers and the hash table. - */ -static DEFINE_MUTEX(markers_mutex); - -/* - * Marker hash table, containing the active markers. - * Protected by module_mutex. - */ -#define MARKER_HASH_BITS 6 -#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) -static struct hlist_head marker_table[MARKER_TABLE_SIZE]; - -/* - * Note about RCU : - * It is used to make sure every handler has finished using its private data - * between two consecutive operation (add or remove) on a given marker. It is - * also used to delay the free of multiple probes array until a quiescent state - * is reached. - * marker entries modifications are protected by the markers_mutex. - */ -struct marker_entry { - struct hlist_node hlist; - char *format; - /* Probe wrapper */ - void (*call)(const struct marker *mdata, void *call_private, ...); - struct marker_probe_closure single; - struct marker_probe_closure *multi; - int refcount; /* Number of times armed. 0 if disarmed. */ - struct rcu_head rcu; - void *oldptr; - int rcu_pending; - unsigned char ptype:1; - unsigned char format_allocated:1; - char name[0]; /* Contains name'\0'format'\0' */ -}; - -/** - * __mark_empty_function - Empty probe callback - * @probe_private: probe private data - * @call_private: call site private data - * @fmt: format string - * @...: variable argument list - * - * Empty callback provided as a probe to the markers. By providing this to a - * disabled marker, we make sure the execution flow is always valid even - * though the function pointer change and the marker enabling are two distinct - * operations that modifies the execution flow of preemptible code. - */ -notrace void __mark_empty_function(void *probe_private, void *call_private, - const char *fmt, va_list *args) -{ -} -EXPORT_SYMBOL_GPL(__mark_empty_function); - -/* - * marker_probe_cb Callback that prepares the variable argument list for probes. - * @mdata: pointer of type struct marker - * @call_private: caller site private data - * @...: Variable argument list. - * - * Since we do not use "typical" pointer based RCU in the 1 argument case, we - * need to put a full smp_rmb() in this branch. This is why we do not use - * rcu_dereference() for the pointer read. - */ -notrace void marker_probe_cb(const struct marker *mdata, - void *call_private, ...) -{ - va_list args; - char ptype; - - /* - * rcu_read_lock_sched does two things : disabling preemption to make - * sure the teardown of the callbacks can be done correctly when they - * are in modules and they insure RCU read coherency. - */ - rcu_read_lock_sched_notrace(); - ptype = mdata->ptype; - if (likely(!ptype)) { - marker_probe_func *func; - /* Must read the ptype before ptr. They are not data dependant, - * so we put an explicit smp_rmb() here. */ - smp_rmb(); - func = mdata->single.func; - /* Must read the ptr before private data. They are not data - * dependant, so we put an explicit smp_rmb() here. */ - smp_rmb(); - va_start(args, call_private); - func(mdata->single.probe_private, call_private, mdata->format, - &args); - va_end(args); - } else { - struct marker_probe_closure *multi; - int i; - /* - * Read mdata->ptype before mdata->multi. - */ - smp_rmb(); - multi = mdata->multi; - /* - * multi points to an array, therefore accessing the array - * depends on reading multi. However, even in this case, - * we must insure that the pointer is read _before_ the array - * data. Same as rcu_dereference, but we need a full smp_rmb() - * in the fast path, so put the explicit barrier here. - */ - smp_read_barrier_depends(); - for (i = 0; multi[i].func; i++) { - va_start(args, call_private); - multi[i].func(multi[i].probe_private, call_private, - mdata->format, &args); - va_end(args); - } - } - rcu_read_unlock_sched_notrace(); -} -EXPORT_SYMBOL_GPL(marker_probe_cb); - -/* - * marker_probe_cb Callback that does not prepare the variable argument list. - * @mdata: pointer of type struct marker - * @call_private: caller site private data - * @...: Variable argument list. - * - * Should be connected to markers "MARK_NOARGS". - */ -static notrace void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, ...) -{ - va_list args; /* not initialized */ - char ptype; - - rcu_read_lock_sched_notrace(); - ptype = mdata->ptype; - if (likely(!ptype)) { - marker_probe_func *func; - /* Must read the ptype before ptr. They are not data dependant, - * so we put an explicit smp_rmb() here. */ - smp_rmb(); - func = mdata->single.func; - /* Must read the ptr before private data. They are not data - * dependant, so we put an explicit smp_rmb() here. */ - smp_rmb(); - func(mdata->single.probe_private, call_private, mdata->format, - &args); - } else { - struct marker_probe_closure *multi; - int i; - /* - * Read mdata->ptype before mdata->multi. - */ - smp_rmb(); - multi = mdata->multi; - /* - * multi points to an array, therefore accessing the array - * depends on reading multi. However, even in this case, - * we must insure that the pointer is read _before_ the array - * data. Same as rcu_dereference, but we need a full smp_rmb() - * in the fast path, so put the explicit barrier here. - */ - smp_read_barrier_depends(); - for (i = 0; multi[i].func; i++) - multi[i].func(multi[i].probe_private, call_private, - mdata->format, &args); - } - rcu_read_unlock_sched_notrace(); -} - -static void free_old_closure(struct rcu_head *head) -{ - struct marker_entry *entry = container_of(head, - struct marker_entry, rcu); - kfree(entry->oldptr); - /* Make sure we free the data before setting the pending flag to 0 */ - smp_wmb(); - entry->rcu_pending = 0; -} - -static void debug_print_probes(struct marker_entry *entry) -{ - int i; - - if (!marker_debug) - return; - - if (!entry->ptype) { - printk(KERN_DEBUG "Single probe : %p %p\n", - entry->single.func, - entry->single.probe_private); - } else { - for (i = 0; entry->multi[i].func; i++) - printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, - entry->multi[i].func, - entry->multi[i].probe_private); - } -} - -static struct marker_probe_closure * -marker_entry_add_probe(struct marker_entry *entry, - marker_probe_func *probe, void *probe_private) -{ - int nr_probes = 0; - struct marker_probe_closure *old, *new; - - WARN_ON(!probe); - - debug_print_probes(entry); - old = entry->multi; - if (!entry->ptype) { - if (entry->single.func == probe && - entry->single.probe_private == probe_private) - return ERR_PTR(-EBUSY); - if (entry->single.func == __mark_empty_function) { - /* 0 -> 1 probes */ - entry->single.func = probe; - entry->single.probe_private = probe_private; - entry->refcount = 1; - entry->ptype = 0; - debug_print_probes(entry); - return NULL; - } else { - /* 1 -> 2 probes */ - nr_probes = 1; - old = NULL; - } - } else { - /* (N -> N+1), (N != 0, 1) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) - if (old[nr_probes].func == probe - && old[nr_probes].probe_private - == probe_private) - return ERR_PTR(-EBUSY); - } - /* + 2 : one for new probe, one for NULL func */ - new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure), - GFP_KERNEL); - if (new == NULL) - return ERR_PTR(-ENOMEM); - if (!old) - new[0] = entry->single; - else - memcpy(new, old, - nr_probes * sizeof(struct marker_probe_closure)); - new[nr_probes].func = probe; - new[nr_probes].probe_private = probe_private; - entry->refcount = nr_probes + 1; - entry->multi = new; - entry->ptype = 1; - debug_print_probes(entry); - return old; -} - -static struct marker_probe_closure * -marker_entry_remove_probe(struct marker_entry *entry, - marker_probe_func *probe, void *probe_private) -{ - int nr_probes = 0, nr_del = 0, i; - struct marker_probe_closure *old, *new; - - old = entry->multi; - - debug_print_probes(entry); - if (!entry->ptype) { - /* 0 -> N is an error */ - WARN_ON(entry->single.func == __mark_empty_function); - /* 1 -> 0 probes */ - WARN_ON(probe && entry->single.func != probe); - WARN_ON(entry->single.probe_private != probe_private); - entry->single.func = __mark_empty_function; - entry->refcount = 0; - entry->ptype = 0; - debug_print_probes(entry); - return NULL; - } else { - /* (N -> M), (N > 1, M >= 0) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - if ((!probe || old[nr_probes].func == probe) - && old[nr_probes].probe_private - == probe_private) - nr_del++; - } - } - - if (nr_probes - nr_del == 0) { - /* N -> 0, (N > 1) */ - entry->single.func = __mark_empty_function; - entry->refcount = 0; - entry->ptype = 0; - } else if (nr_probes - nr_del == 1) { - /* N -> 1, (N > 1) */ - for (i = 0; old[i].func; i++) - if ((probe && old[i].func != probe) || - old[i].probe_private != probe_private) - entry->single = old[i]; - entry->refcount = 1; - entry->ptype = 0; - } else { - int j = 0; - /* N -> M, (N > 1, M > 1) */ - /* + 1 for NULL */ - new = kzalloc((nr_probes - nr_del + 1) - * sizeof(struct marker_probe_closure), GFP_KERNEL); - if (new == NULL) - return ERR_PTR(-ENOMEM); - for (i = 0; old[i].func; i++) - if ((probe && old[i].func != probe) || - old[i].probe_private != probe_private) - new[j++] = old[i]; - entry->refcount = nr_probes - nr_del; - entry->ptype = 1; - entry->multi = new; - } - debug_print_probes(entry); - return old; -} - -/* - * Get marker if the marker is present in the marker hash table. - * Must be called with markers_mutex held. - * Returns NULL if not present. - */ -static struct marker_entry *get_marker(const char *name) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - u32 hash = jhash(name, strlen(name), 0); - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) - return e; - } - return NULL; -} - -/* - * Add the marker to the marker hash table. Must be called with markers_mutex - * held. - */ -static struct marker_entry *add_marker(const char *name, const char *format) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - size_t name_len = strlen(name) + 1; - size_t format_len = 0; - u32 hash = jhash(name, name_len-1, 0); - - if (format) - format_len = strlen(format) + 1; - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - printk(KERN_NOTICE - "Marker %s busy\n", name); - return ERR_PTR(-EBUSY); /* Already there */ - } - } - /* - * Using kmalloc here to allocate a variable length element. Could - * cause some memory fragmentation if overused. - */ - e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, - GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - memcpy(&e->name[0], name, name_len); - if (format) { - e->format = &e->name[name_len]; - memcpy(e->format, format, format_len); - if (strcmp(e->format, MARK_NOARGS) == 0) - e->call = marker_probe_cb_noarg; - else - e->call = marker_probe_cb; - trace_mark(core_marker_format, "name %s format %s", - e->name, e->format); - } else { - e->format = NULL; - e->call = marker_probe_cb; - } - e->single.func = __mark_empty_function; - e->single.probe_private = NULL; - e->multi = NULL; - e->ptype = 0; - e->format_allocated = 0; - e->refcount = 0; - e->rcu_pending = 0; - hlist_add_head(&e->hlist, head); - return e; -} - -/* - * Remove the marker from the marker hash table. Must be called with mutex_lock - * held. - */ -static int remove_marker(const char *name) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - int found = 0; - size_t len = strlen(name) + 1; - u32 hash = jhash(name, len-1, 0); - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - found = 1; - break; - } - } - if (!found) - return -ENOENT; - if (e->single.func != __mark_empty_function) - return -EBUSY; - hlist_del(&e->hlist); - if (e->format_allocated) - kfree(e->format); - /* Make sure the call_rcu has been executed */ - if (e->rcu_pending) - rcu_barrier_sched(); - kfree(e); - return 0; -} - -/* - * Set the mark_entry format to the format found in the element. - */ -static int marker_set_format(struct marker_entry *entry, const char *format) -{ - entry->format = kstrdup(format, GFP_KERNEL); - if (!entry->format) - return -ENOMEM; - entry->format_allocated = 1; - - trace_mark(core_marker_format, "name %s format %s", - entry->name, entry->format); - return 0; -} - -/* - * Sets the probe callback corresponding to one marker. - */ -static int set_marker(struct marker_entry *entry, struct marker *elem, - int active) -{ - int ret = 0; - WARN_ON(strcmp(entry->name, elem->name) != 0); - - if (entry->format) { - if (strcmp(entry->format, elem->format) != 0) { - printk(KERN_NOTICE - "Format mismatch for probe %s " - "(%s), marker (%s)\n", - entry->name, - entry->format, - elem->format); - return -EPERM; - } - } else { - ret = marker_set_format(entry, elem->format); - if (ret) - return ret; - } - - /* - * probe_cb setup (statically known) is done here. It is - * asynchronous with the rest of execution, therefore we only - * pass from a "safe" callback (with argument) to an "unsafe" - * callback (does not set arguments). - */ - elem->call = entry->call; - /* - * Sanity check : - * We only update the single probe private data when the ptr is - * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) - */ - WARN_ON(elem->single.func != __mark_empty_function - && elem->single.probe_private != entry->single.probe_private - && !elem->ptype); - elem->single.probe_private = entry->single.probe_private; - /* - * Make sure the private data is valid when we update the - * single probe ptr. - */ - smp_wmb(); - elem->single.func = entry->single.func; - /* - * We also make sure that the new probe callbacks array is consistent - * before setting a pointer to it. - */ - rcu_assign_pointer(elem->multi, entry->multi); - /* - * Update the function or multi probe array pointer before setting the - * ptype. - */ - smp_wmb(); - elem->ptype = entry->ptype; - - if (elem->tp_name && (active ^ elem->state)) { - WARN_ON(!elem->tp_cb); - /* - * It is ok to directly call the probe registration because type - * checking has been done in the __trace_mark_tp() macro. - */ - - if (active) { - /* - * try_module_get should always succeed because we hold - * lock_module() to get the tp_cb address. - */ - ret = try_module_get(__module_text_address( - (unsigned long)elem->tp_cb)); - BUG_ON(!ret); - ret = tracepoint_probe_register_noupdate( - elem->tp_name, - elem->tp_cb); - } else { - ret = tracepoint_probe_unregister_noupdate( - elem->tp_name, - elem->tp_cb); - /* - * tracepoint_probe_update_all() must be called - * before the module containing tp_cb is unloaded. - */ - module_put(__module_text_address( - (unsigned long)elem->tp_cb)); - } - } - elem->state = active; - - return ret; -} - -/* - * Disable a marker and its probe callback. - * Note: only waiting an RCU period after setting elem->call to the empty - * function insures that the original callback is not used anymore. This insured - * by rcu_read_lock_sched around the call site. - */ -static void disable_marker(struct marker *elem) -{ - int ret; - - /* leave "call" as is. It is known statically. */ - if (elem->tp_name && elem->state) { - WARN_ON(!elem->tp_cb); - /* - * It is ok to directly call the probe registration because type - * checking has been done in the __trace_mark_tp() macro. - */ - ret = tracepoint_probe_unregister_noupdate(elem->tp_name, - elem->tp_cb); - WARN_ON(ret); - /* - * tracepoint_probe_update_all() must be called - * before the module containing tp_cb is unloaded. - */ - module_put(__module_text_address((unsigned long)elem->tp_cb)); - } - elem->state = 0; - elem->single.func = __mark_empty_function; - /* Update the function before setting the ptype */ - smp_wmb(); - elem->ptype = 0; /* single probe */ - /* - * Leave the private data and id there, because removal is racy and - * should be done only after an RCU period. These are never used until - * the next initialization anyway. - */ -} - -/** - * marker_update_probe_range - Update a probe range - * @begin: beginning of the range - * @end: end of the range - * - * Updates the probe callback corresponding to a range of markers. - */ -void marker_update_probe_range(struct marker *begin, - struct marker *end) -{ - struct marker *iter; - struct marker_entry *mark_entry; - - mutex_lock(&markers_mutex); - for (iter = begin; iter < end; iter++) { - mark_entry = get_marker(iter->name); - if (mark_entry) { - set_marker(mark_entry, iter, !!mark_entry->refcount); - /* - * ignore error, continue - */ - } else { - disable_marker(iter); - } - } - mutex_unlock(&markers_mutex); -} - -/* - * Update probes, removing the faulty probes. - * - * Internal callback only changed before the first probe is connected to it. - * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 - * transitions. All other transitions will leave the old private data valid. - * This makes the non-atomicity of the callback/private data updates valid. - * - * "special case" updates : - * 0 -> 1 callback - * 1 -> 0 callback - * 1 -> 2 callbacks - * 2 -> 1 callbacks - * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. - * Site effect : marker_set_format may delete the marker entry (creating a - * replacement). - */ -static void marker_update_probes(void) -{ - /* Core kernel markers */ - marker_update_probe_range(__start___markers, __stop___markers); - /* Markers in modules. */ - module_update_markers(); - tracepoint_probe_update_all(); -} - -/** - * marker_probe_register - Connect a probe to a marker - * @name: marker name - * @format: format string - * @probe: probe handler - * @probe_private: probe private data - * - * private data must be a valid allocated memory address, or NULL. - * Returns 0 if ok, error value on error. - * The probe address must at least be aligned on the architecture pointer size. - */ -int marker_probe_register(const char *name, const char *format, - marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - int ret = 0; - struct marker_probe_closure *old; - - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) { - entry = add_marker(name, format); - if (IS_ERR(entry)) - ret = PTR_ERR(entry); - } else if (format) { - if (!entry->format) - ret = marker_set_format(entry, format); - else if (strcmp(entry->format, format)) - ret = -EPERM; - } - if (ret) - goto end; - - /* - * If we detect that a call_rcu is pending for this marker, - * make sure it's executed now. - */ - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_add_probe(entry, probe, probe_private); - if (IS_ERR(old)) { - ret = PTR_ERR(old); - goto end; - } - mutex_unlock(&markers_mutex); - marker_update_probes(); - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_register); - -/** - * marker_probe_unregister - Disconnect a probe from a marker - * @name: marker name - * @probe: probe function pointer - * @probe_private: probe private data - * - * Returns the private data given to marker_probe_register, or an ERR_PTR(). - * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have finished. - */ -int marker_probe_unregister(const char *name, - marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - struct marker_probe_closure *old; - int ret = -ENOENT; - - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_remove_probe(entry, probe, probe_private); - mutex_unlock(&markers_mutex); - marker_update_probes(); - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); - remove_marker(name); /* Ignore busy error message */ - ret = 0; -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_unregister); - -static struct marker_entry * -get_marker_from_private_data(marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - unsigned int i; - struct hlist_head *head; - struct hlist_node *node; - - for (i = 0; i < MARKER_TABLE_SIZE; i++) { - head = &marker_table[i]; - hlist_for_each_entry(entry, node, head, hlist) { - if (!entry->ptype) { - if (entry->single.func == probe - && entry->single.probe_private - == probe_private) - return entry; - } else { - struct marker_probe_closure *closure; - closure = entry->multi; - for (i = 0; closure[i].func; i++) { - if (closure[i].func == probe && - closure[i].probe_private - == probe_private) - return entry; - } - } - } - } - return NULL; -} - -/** - * marker_probe_unregister_private_data - Disconnect a probe from a marker - * @probe: probe function - * @probe_private: probe private data - * - * Unregister a probe by providing the registered private data. - * Only removes the first marker found in hash table. - * Return 0 on success or error value. - * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have finished. - */ -int marker_probe_unregister_private_data(marker_probe_func *probe, - void *probe_private) -{ - struct marker_entry *entry; - int ret = 0; - struct marker_probe_closure *old; - - mutex_lock(&markers_mutex); - entry = get_marker_from_private_data(probe, probe_private); - if (!entry) { - ret = -ENOENT; - goto end; - } - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_remove_probe(entry, NULL, probe_private); - mutex_unlock(&markers_mutex); - marker_update_probes(); - mutex_lock(&markers_mutex); - entry = get_marker_from_private_data(probe, probe_private); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); - remove_marker(entry->name); /* Ignore busy error message */ -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); - -/** - * marker_get_private_data - Get a marker's probe private data - * @name: marker name - * @probe: probe to match - * @num: get the nth matching probe's private data - * - * Returns the nth private data pointer (starting from 0) matching, or an - * ERR_PTR. - * Returns the private data pointer, or an ERR_PTR. - * The private data pointer should _only_ be dereferenced if the caller is the - * owner of the data, or its content could vanish. This is mostly used to - * confirm that a caller is the owner of a registered probe. - */ -void *marker_get_private_data(const char *name, marker_probe_func *probe, - int num) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - size_t name_len = strlen(name) + 1; - u32 hash = jhash(name, name_len-1, 0); - int i; - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - if (!e->ptype) { - if (num == 0 && e->single.func == probe) - return e->single.probe_private; - } else { - struct marker_probe_closure *closure; - int match = 0; - closure = e->multi; - for (i = 0; closure[i].func; i++) { - if (closure[i].func != probe) - continue; - if (match++ == num) - return closure[i].probe_private; - } - } - break; - } - } - return ERR_PTR(-ENOENT); -} -EXPORT_SYMBOL_GPL(marker_get_private_data); - -#ifdef CONFIG_MODULES - -int marker_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - - switch (val) { - case MODULE_STATE_COMING: - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); - break; - case MODULE_STATE_GOING: - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); - break; - } - return 0; -} - -struct notifier_block marker_module_nb = { - .notifier_call = marker_module_notify, - .priority = 0, -}; - -static int init_markers(void) -{ - return register_module_notifier(&marker_module_nb); -} -__initcall(init_markers); - -#endif /* CONFIG_MODULES */ diff --git a/kernel/module.c b/kernel/module.c index 46580ed..e6bc4b2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -47,6 +47,7 @@ #include <linux/rculist.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> +#include <asm/mmu_context.h> #include <linux/license.h> #include <asm/sections.h> #include <linux/tracepoint.h> @@ -369,7 +370,7 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) @@ -394,7 +395,7 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -540,7 +541,7 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, @@ -1535,6 +1536,10 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ module_free(mod, mod->module_core); + +#ifdef CONFIG_MPU + update_protections(current->mm); +#endif } void *__symbol_get(const char *symbol) @@ -2237,10 +2242,6 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->ctors), &mod->num_ctors); #endif -#ifdef CONFIG_MARKERS - mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", - sizeof(*mod->markers), &mod->num_markers); -#endif #ifdef CONFIG_TRACEPOINTS mod->tracepoints = section_objs(hdr, sechdrs, secstrings, "__tracepoints", @@ -2958,20 +2959,6 @@ void module_layout(struct module *mod, EXPORT_SYMBOL(module_layout); #endif -#ifdef CONFIG_MARKERS -void module_update_markers(void) -{ - struct module *mod; - - mutex_lock(&module_mutex); - list_for_each_entry(mod, &modules, list) - if (!mod->taints) - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); - mutex_unlock(&module_mutex); -} -#endif - #ifdef CONFIG_TRACEPOINTS void module_update_tracepoints(void) { diff --git a/kernel/panic.c b/kernel/panic.c index 512ab73..bcdef26 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -177,7 +177,7 @@ static const struct tnt tnts[] = { * 'W' - Taint on warning. * 'C' - modules from drivers/staging are loaded. * - * The string is overwritten by the next call to print_taint(). + * The string is overwritten by the next call to print_tainted(). */ const char *print_tainted(void) { diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c deleted file mode 100644 index d7cbc57..0000000 --- a/kernel/perf_counter.c +++ /dev/null @@ -1,4861 +0,0 @@ -/* - * Performance counter core code - * - * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> - * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING - */ - -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/file.h> -#include <linux/poll.h> -#include <linux/sysfs.h> -#include <linux/dcache.h> -#include <linux/percpu.h> -#include <linux/ptrace.h> -#include <linux/vmstat.h> -#include <linux/hardirq.h> -#include <linux/rculist.h> -#include <linux/uaccess.h> -#include <linux/syscalls.h> -#include <linux/anon_inodes.h> -#include <linux/kernel_stat.h> -#include <linux/perf_counter.h> - -#include <asm/irq_regs.h> - -/* - * Each CPU has a list of per CPU counters: - */ -DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); - -int perf_max_counters __read_mostly = 1; -static int perf_reserved_percpu __read_mostly; -static int perf_overcommit __read_mostly = 1; - -static atomic_t nr_counters __read_mostly; -static atomic_t nr_mmap_counters __read_mostly; -static atomic_t nr_comm_counters __read_mostly; -static atomic_t nr_task_counters __read_mostly; - -/* - * perf counter paranoia level: - * 0 - not paranoid - * 1 - disallow cpu counters to unpriv - * 2 - disallow kernel profiling to unpriv - */ -int sysctl_perf_counter_paranoid __read_mostly = 1; - -static inline bool perf_paranoid_cpu(void) -{ - return sysctl_perf_counter_paranoid > 0; -} - -static inline bool perf_paranoid_kernel(void) -{ - return sysctl_perf_counter_paranoid > 1; -} - -int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ - -/* - * max perf counter sample rate - */ -int sysctl_perf_counter_sample_rate __read_mostly = 100000; - -static atomic64_t perf_counter_id; - -/* - * Lock for (sysadmin-configurable) counter reservations: - */ -static DEFINE_SPINLOCK(perf_resource_lock); - -/* - * Architecture provided APIs - weak aliases: - */ -extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - return NULL; -} - -void __weak hw_perf_disable(void) { barrier(); } -void __weak hw_perf_enable(void) { barrier(); } - -void __weak hw_perf_counter_setup(int cpu) { barrier(); } -void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } - -int __weak -hw_perf_group_sched_in(struct perf_counter *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu) -{ - return 0; -} - -void __weak perf_counter_print_debug(void) { } - -static DEFINE_PER_CPU(int, disable_count); - -void __perf_disable(void) -{ - __get_cpu_var(disable_count)++; -} - -bool __perf_enable(void) -{ - return !--__get_cpu_var(disable_count); -} - -void perf_disable(void) -{ - __perf_disable(); - hw_perf_disable(); -} - -void perf_enable(void) -{ - if (__perf_enable()) - hw_perf_enable(); -} - -static void get_ctx(struct perf_counter_context *ctx) -{ - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); -} - -static void free_ctx(struct rcu_head *head) -{ - struct perf_counter_context *ctx; - - ctx = container_of(head, struct perf_counter_context, rcu_head); - kfree(ctx); -} - -static void put_ctx(struct perf_counter_context *ctx) -{ - if (atomic_dec_and_test(&ctx->refcount)) { - if (ctx->parent_ctx) - put_ctx(ctx->parent_ctx); - if (ctx->task) - put_task_struct(ctx->task); - call_rcu(&ctx->rcu_head, free_ctx); - } -} - -static void unclone_ctx(struct perf_counter_context *ctx) -{ - if (ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; - } -} - -/* - * If we inherit counters we want to return the parent counter id - * to userspace. - */ -static u64 primary_counter_id(struct perf_counter *counter) -{ - u64 id = counter->id; - - if (counter->parent) - id = counter->parent->id; - - return id; -} - -/* - * Get the perf_counter_context for a task and lock it. - * This has to cope with with the fact that until it is locked, - * the context could get moved to another task. - */ -static struct perf_counter_context * -perf_lock_task_context(struct task_struct *task, unsigned long *flags) -{ - struct perf_counter_context *ctx; - - rcu_read_lock(); - retry: - ctx = rcu_dereference(task->perf_counter_ctxp); - if (ctx) { - /* - * If this context is a clone of another, it might - * get swapped for another underneath us by - * perf_counter_task_sched_out, though the - * rcu_read_lock() protects us from any context - * getting freed. Lock the context and check if it - * got swapped before we could get the lock, and retry - * if so. If we locked the right context, then it - * can't get swapped on us any more. - */ - spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_counter_ctxp)) { - spin_unlock_irqrestore(&ctx->lock, *flags); - goto retry; - } - - if (!atomic_inc_not_zero(&ctx->refcount)) { - spin_unlock_irqrestore(&ctx->lock, *flags); - ctx = NULL; - } - } - rcu_read_unlock(); - return ctx; -} - -/* - * Get the context for a task and increment its pin_count so it - * can't get swapped to another task. This also increments its - * reference count so that the context can't get freed. - */ -static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) -{ - struct perf_counter_context *ctx; - unsigned long flags; - - ctx = perf_lock_task_context(task, &flags); - if (ctx) { - ++ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); - } - return ctx; -} - -static void perf_unpin_context(struct perf_counter_context *ctx) -{ - unsigned long flags; - - spin_lock_irqsave(&ctx->lock, flags); - --ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); - put_ctx(ctx); -} - -/* - * Add a counter from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -{ - struct perf_counter *group_leader = counter->group_leader; - - /* - * Depending on whether it is a standalone or sibling counter, - * add it straight to the context's counter list, or to the group - * leader's sibling list: - */ - if (group_leader == counter) - list_add_tail(&counter->list_entry, &ctx->counter_list); - else { - list_add_tail(&counter->list_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; - } - - list_add_rcu(&counter->event_entry, &ctx->event_list); - ctx->nr_counters++; - if (counter->attr.inherit_stat) - ctx->nr_stat++; -} - -/* - * Remove a counter from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -{ - struct perf_counter *sibling, *tmp; - - if (list_empty(&counter->list_entry)) - return; - ctx->nr_counters--; - if (counter->attr.inherit_stat) - ctx->nr_stat--; - - list_del_init(&counter->list_entry); - list_del_rcu(&counter->event_entry); - - if (counter->group_leader != counter) - counter->group_leader->nr_siblings--; - - /* - * If this was a group counter with sibling counters then - * upgrade the siblings to singleton counters by adding them - * to the context list directly: - */ - list_for_each_entry_safe(sibling, tmp, - &counter->sibling_list, list_entry) { - - list_move_tail(&sibling->list_entry, &ctx->counter_list); - sibling->group_leader = sibling; - } -} - -static void -counter_sched_out(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - if (counter->pending_disable) { - counter->pending_disable = 0; - counter->state = PERF_COUNTER_STATE_OFF; - } - counter->tstamp_stopped = ctx->time; - counter->pmu->disable(counter); - counter->oncpu = -1; - - if (!is_software_counter(counter)) - cpuctx->active_oncpu--; - ctx->nr_active--; - if (counter->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; -} - -static void -group_sched_out(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - - if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter_sched_out(group_counter, cpuctx, ctx); - - /* - * Schedule out siblings (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) - counter_sched_out(counter, cpuctx, ctx); - - if (group_counter->attr.exclusive) - cpuctx->exclusive = 0; -} - -/* - * Cross CPU call to remove a performance counter - * - * We disable the counter on the hardware level first. After that we - * remove it from the context list. - */ -static void __perf_counter_remove_from_context(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - spin_lock(&ctx->lock); - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. - */ - perf_disable(); - - counter_sched_out(counter, cpuctx, ctx); - - list_del_counter(counter, ctx); - - if (!ctx->task) { - /* - * Allow more per task counters with respect to the - * reservation: - */ - cpuctx->max_pertask = - min(perf_max_counters - ctx->nr_counters, - perf_max_counters - perf_reserved_percpu); - } - - perf_enable(); - spin_unlock(&ctx->lock); -} - - -/* - * Remove the counter from a task's (or a CPU's) list of counters. - * - * Must be called with ctx->mutex held. - * - * CPU counters are removed with a smp call. For task counters we only - * call when the task is on a CPU. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This is OK when called from perf_release since - * that only calls us on the top-level context, which can't be a clone. - * When called from perf_counter_exit_task, it's OK because the - * context has been detached from its task. - */ -static void perf_counter_remove_from_context(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Per cpu counters are removed via an smp call and - * the removal is always sucessful. - */ - smp_call_function_single(counter->cpu, - __perf_counter_remove_from_context, - counter, 1); - return; - } - -retry: - task_oncpu_function_call(task, __perf_counter_remove_from_context, - counter); - - spin_lock_irq(&ctx->lock); - /* - * If the context is active we need to retry the smp call. - */ - if (ctx->nr_active && !list_empty(&counter->list_entry)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * The lock prevents that this context is scheduled in so we - * can remove the counter safely, if the call above did not - * succeed. - */ - if (!list_empty(&counter->list_entry)) { - list_del_counter(counter, ctx); - } - spin_unlock_irq(&ctx->lock); -} - -static inline u64 perf_clock(void) -{ - return cpu_clock(smp_processor_id()); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_counter_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -/* - * Update the total_time_enabled and total_time_running fields for a counter. - */ -static void update_counter_times(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - u64 run_end; - - if (counter->state < PERF_COUNTER_STATE_INACTIVE) - return; - - counter->total_time_enabled = ctx->time - counter->tstamp_enabled; - - if (counter->state == PERF_COUNTER_STATE_INACTIVE) - run_end = counter->tstamp_stopped; - else - run_end = ctx->time; - - counter->total_time_running = run_end - counter->tstamp_running; -} - -/* - * Update total_time_enabled and total_time_running for all counters in a group. - */ -static void update_group_times(struct perf_counter *leader) -{ - struct perf_counter *counter; - - update_counter_times(leader); - list_for_each_entry(counter, &leader->sibling_list, list_entry) - update_counter_times(counter); -} - -/* - * Cross CPU call to disable a performance counter - */ -static void __perf_counter_disable(void *info) -{ - struct perf_counter *counter = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - - /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - spin_lock(&ctx->lock); - - /* - * If the counter is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - update_context_time(ctx); - update_counter_times(counter); - if (counter == counter->group_leader) - group_sched_out(counter, cpuctx, ctx); - else - counter_sched_out(counter, cpuctx, ctx); - counter->state = PERF_COUNTER_STATE_OFF; - } - - spin_unlock(&ctx->lock); -} - -/* - * Disable a counter. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through - * perf_counter_for_each_child or perf_counter_for_each because they - * hold the top-level counter's child_mutex, so any descendant that - * goes to exit will block in sync_child_counter. - * When called from perf_pending_counter it's OK because counter->ctx - * is the current context on this CPU and preemption is disabled, - * hence we can't get into perf_counter_task_sched_out for this context. - */ -static void perf_counter_disable(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Disable the counter on the cpu that it's on - */ - smp_call_function_single(counter->cpu, __perf_counter_disable, - counter, 1); - return; - } - - retry: - task_oncpu_function_call(task, __perf_counter_disable, counter); - - spin_lock_irq(&ctx->lock); - /* - * If the counter is still active, we need to retry the cross-call. - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_counter_times(counter); - counter->state = PERF_COUNTER_STATE_OFF; - } - - spin_unlock_irq(&ctx->lock); -} - -static int -counter_sched_in(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - if (counter->state <= PERF_COUNTER_STATE_OFF) - return 0; - - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); - - if (counter->pmu->enable(counter)) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->oncpu = -1; - return -EAGAIN; - } - - counter->tstamp_running += ctx->time - counter->tstamp_stopped; - - if (!is_software_counter(counter)) - cpuctx->active_oncpu++; - ctx->nr_active++; - - if (counter->attr.exclusive) - cpuctx->exclusive = 1; - - return 0; -} - -static int -group_sched_in(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - struct perf_counter *counter, *partial_group; - int ret; - - if (group_counter->state == PERF_COUNTER_STATE_OFF) - return 0; - - ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); - if (ret) - return ret < 0 ? ret : 0; - - if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) - return -EAGAIN; - - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) { - partial_group = counter; - goto group_error; - } - } - - return 0; - -group_error: - /* - * Groups can be scheduled in as one unit only, so undo any - * partial group before returning: - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - if (counter == partial_group) - break; - counter_sched_out(counter, cpuctx, ctx); - } - counter_sched_out(group_counter, cpuctx, ctx); - - return -EAGAIN; -} - -/* - * Return 1 for a group consisting entirely of software counters, - * 0 if the group contains any hardware counters. - */ -static int is_software_only_group(struct perf_counter *leader) -{ - struct perf_counter *counter; - - if (!is_software_counter(leader)) - return 0; - - list_for_each_entry(counter, &leader->sibling_list, list_entry) - if (!is_software_counter(counter)) - return 0; - - return 1; -} - -/* - * Work out whether we can put this counter group on the CPU now. - */ -static int group_can_go_on(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - int can_add_hw) -{ - /* - * Groups consisting entirely of software counters can always go on. - */ - if (is_software_only_group(counter)) - return 1; - /* - * If an exclusive group is already on, no other hardware - * counters can go on. - */ - if (cpuctx->exclusive) - return 0; - /* - * If this group is exclusive and there are already - * counters on the CPU, it can't go on. - */ - if (counter->attr.exclusive && cpuctx->active_oncpu) - return 0; - /* - * Otherwise, try to add it if all previous groups were able - * to go on. - */ - return can_add_hw; -} - -static void add_counter_to_ctx(struct perf_counter *counter, - struct perf_counter_context *ctx) -{ - list_add_counter(counter, ctx); - counter->tstamp_enabled = ctx->time; - counter->tstamp_running = ctx->time; - counter->tstamp_stopped = ctx->time; -} - -/* - * Cross CPU call to install and enable a performance counter - * - * Must be called with ctx->mutex held - */ -static void __perf_install_in_context(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; - int cpu = smp_processor_id(); - int err; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - * Or possibly this is the right context but it isn't - * on this cpu because it had no counters. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } - - spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. NOP for non NMI based counters. - */ - perf_disable(); - - add_counter_to_ctx(counter, ctx); - - /* - * Don't put the counter on if it is disabled or if - * it is in a group and the group isn't on. - */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE || - (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) - goto unlock; - - /* - * An exclusive counter can't go on if there are already active - * hardware counters, and no hardware counter can go on if there - * is already an exclusive counter on. - */ - if (!group_can_go_on(counter, cpuctx, 1)) - err = -EEXIST; - else - err = counter_sched_in(counter, cpuctx, ctx, cpu); - - if (err) { - /* - * This counter couldn't go on. If it is in a group - * then we have to pull the whole group off. - * If the counter group is pinned then put it in error state. - */ - if (leader != counter) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; - } - } - - if (!err && !ctx->task && cpuctx->max_pertask) - cpuctx->max_pertask--; - - unlock: - perf_enable(); - - spin_unlock(&ctx->lock); -} - -/* - * Attach a performance counter to a context - * - * First we add the counter to the list with the hardware enable bit - * in counter->hw_config cleared. - * - * If the counter is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. - * - * Must be called with ctx->mutex held. - */ -static void -perf_install_in_context(struct perf_counter_context *ctx, - struct perf_counter *counter, - int cpu) -{ - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Per cpu counters are installed via an smp call and - * the install is always sucessful. - */ - smp_call_function_single(cpu, __perf_install_in_context, - counter, 1); - return; - } - -retry: - task_oncpu_function_call(task, __perf_install_in_context, - counter); - - spin_lock_irq(&ctx->lock); - /* - * we need to retry the smp call. - */ - if (ctx->is_active && list_empty(&counter->list_entry)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * The lock prevents that this context is scheduled in so we - * can add the counter safely, if it the call above did not - * succeed. - */ - if (list_empty(&counter->list_entry)) - add_counter_to_ctx(counter, ctx); - spin_unlock_irq(&ctx->lock); -} - -/* - * Cross CPU call to enable a performance counter - */ -static void __perf_counter_enable(void *info) -{ - struct perf_counter *counter = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; - int err; - - /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } - - spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - goto unlock; - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - - /* - * If the counter is in a group and isn't the group leader, - * then don't put it on unless the group is on. - */ - if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(counter, cpuctx, 1)) { - err = -EEXIST; - } else { - perf_disable(); - if (counter == leader) - err = group_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - else - err = counter_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - perf_enable(); - } - - if (err) { - /* - * If this counter can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != counter) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; - } - } - - unlock: - spin_unlock(&ctx->lock); -} - -/* - * Enable a counter. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This condition is satisfied when called through - * perf_counter_for_each_child or perf_counter_for_each as described - * for perf_counter_disable. - */ -static void perf_counter_enable(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Enable the counter on the cpu that it's on - */ - smp_call_function_single(counter->cpu, __perf_counter_enable, - counter, 1); - return; - } - - spin_lock_irq(&ctx->lock); - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - goto out; - - /* - * If the counter is in error state, clear that first. - * That way, if we see the counter in error state below, we - * know that it has gone back into error state, as distinct - * from the task having been scheduled away before the - * cross-call arrived. - */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - counter->state = PERF_COUNTER_STATE_OFF; - - retry: - spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_counter_enable, counter); - - spin_lock_irq(&ctx->lock); - - /* - * If the context is active and the counter is still off, - * we need to retry the cross-call. - */ - if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) - goto retry; - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (counter->state == PERF_COUNTER_STATE_OFF) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = - ctx->time - counter->total_time_enabled; - } - out: - spin_unlock_irq(&ctx->lock); -} - -static int perf_counter_refresh(struct perf_counter *counter, int refresh) -{ - /* - * not supported on inherited counters - */ - if (counter->attr.inherit) - return -EINVAL; - - atomic_add(refresh, &counter->event_limit); - perf_counter_enable(counter); - - return 0; -} - -void __perf_counter_sched_out(struct perf_counter_context *ctx, - struct perf_cpu_context *cpuctx) -{ - struct perf_counter *counter; - - spin_lock(&ctx->lock); - ctx->is_active = 0; - if (likely(!ctx->nr_counters)) - goto out; - update_context_time(ctx); - - perf_disable(); - if (ctx->nr_active) { - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter != counter->group_leader) - counter_sched_out(counter, cpuctx, ctx); - else - group_sched_out(counter, cpuctx, ctx); - } - } - perf_enable(); - out: - spin_unlock(&ctx->lock); -} - -/* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled counters. - * If the number of enabled counters is the same, then the set - * of enabled counters should be the same, because these are both - * inherited contexts, therefore we can't access individual counters - * in them directly with an fd; we can only enable/disable all - * counters via prctl, or enable/disable all counters in a family - * via ioctl, which will have the same effect on both contexts. - */ -static int context_equiv(struct perf_counter_context *ctx1, - struct perf_counter_context *ctx2) -{ - return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && !ctx1->pin_count && !ctx2->pin_count; -} - -static void __perf_counter_read(void *counter); - -static void __perf_counter_sync_stat(struct perf_counter *counter, - struct perf_counter *next_counter) -{ - u64 value; - - if (!counter->attr.inherit_stat) - return; - - /* - * Update the counter value, we cannot use perf_counter_read() - * because we're in the middle of a context switch and have IRQs - * disabled, which upsets smp_call_function_single(), however - * we know the counter must be on the current CPU, therefore we - * don't need to use it. - */ - switch (counter->state) { - case PERF_COUNTER_STATE_ACTIVE: - __perf_counter_read(counter); - break; - - case PERF_COUNTER_STATE_INACTIVE: - update_counter_times(counter); - break; - - default: - break; - } - - /* - * In order to keep per-task stats reliable we need to flip the counter - * values when we flip the contexts. - */ - value = atomic64_read(&next_counter->count); - value = atomic64_xchg(&counter->count, value); - atomic64_set(&next_counter->count, value); - - swap(counter->total_time_enabled, next_counter->total_time_enabled); - swap(counter->total_time_running, next_counter->total_time_running); - - /* - * Since we swizzled the values, update the user visible data too. - */ - perf_counter_update_userpage(counter); - perf_counter_update_userpage(next_counter); -} - -#define list_next_entry(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - -static void perf_counter_sync_stat(struct perf_counter_context *ctx, - struct perf_counter_context *next_ctx) -{ - struct perf_counter *counter, *next_counter; - - if (!ctx->nr_stat) - return; - - counter = list_first_entry(&ctx->event_list, - struct perf_counter, event_entry); - - next_counter = list_first_entry(&next_ctx->event_list, - struct perf_counter, event_entry); - - while (&counter->event_entry != &ctx->event_list && - &next_counter->event_entry != &next_ctx->event_list) { - - __perf_counter_sync_stat(counter, next_counter); - - counter = list_next_entry(counter, event_entry); - next_counter = list_next_entry(next_counter, event_entry); - } -} - -/* - * Called from scheduler to remove the counters of the current task, - * with interrupts disabled. - * - * We stop each counter and update the counter value in counter->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * not restart the counter. - */ -void perf_counter_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter_context *next_ctx; - struct perf_counter_context *parent; - struct pt_regs *regs; - int do_switch = 1; - - regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); - - if (likely(!ctx || !cpuctx->task_ctx)) - return; - - update_context_time(ctx); - - rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_counter_ctxp; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { - /* - * Looks like the two contexts are clones, so we might be - * able to optimize the context switch. We lock both - * contexts and check that they are clones under the - * lock (including re-checking that neither has been - * uncloned in the meantime). It doesn't matter which - * order we take the locks because no other cpu could - * be trying to lock both of these tasks. - */ - spin_lock(&ctx->lock); - spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); - if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_counter_ctxp - */ - task->perf_counter_ctxp = next_ctx; - next->perf_counter_ctxp = ctx; - ctx->task = next; - next_ctx->task = task; - do_switch = 0; - - perf_counter_sync_stat(ctx, next_ctx); - } - spin_unlock(&next_ctx->lock); - spin_unlock(&ctx->lock); - } - rcu_read_unlock(); - - if (do_switch) { - __perf_counter_sched_out(ctx, cpuctx); - cpuctx->task_ctx = NULL; - } -} - -/* - * Called with IRQs disabled - */ -static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - __perf_counter_sched_out(ctx, cpuctx); - cpuctx->task_ctx = NULL; -} - -/* - * Called with IRQs disabled - */ -static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) -{ - __perf_counter_sched_out(&cpuctx->ctx, cpuctx); -} - -static void -__perf_counter_sched_in(struct perf_counter_context *ctx, - struct perf_cpu_context *cpuctx, int cpu) -{ - struct perf_counter *counter; - int can_add_hw = 1; - - spin_lock(&ctx->lock); - ctx->is_active = 1; - if (likely(!ctx->nr_counters)) - goto out; - - ctx->timestamp = perf_clock(); - - perf_disable(); - - /* - * First go through the list and put on any pinned groups - * in order to give them the best chance of going on. - */ - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->attr.pinned) - continue; - if (counter->cpu != -1 && counter->cpu != cpu) - continue; - - if (counter != counter->group_leader) - counter_sched_in(counter, cpuctx, ctx, cpu); - else { - if (group_can_go_on(counter, cpuctx, 1)) - group_sched_in(counter, cpuctx, ctx, cpu); - } - - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_ERROR; - } - } - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - /* - * Ignore counters in OFF or ERROR state, and - * ignore pinned counters since we did them already. - */ - if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->attr.pinned) - continue; - - /* - * Listen to the 'cpu' scheduling filter constraint - * of counters: - */ - if (counter->cpu != -1 && counter->cpu != cpu) - continue; - - if (counter != counter->group_leader) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) - can_add_hw = 0; - } else { - if (group_can_go_on(counter, cpuctx, can_add_hw)) { - if (group_sched_in(counter, cpuctx, ctx, cpu)) - can_add_hw = 0; - } - } - } - perf_enable(); - out: - spin_unlock(&ctx->lock); -} - -/* - * Called from scheduler to add the counters of the current task - * with interrupts disabled. - * - * We restore the counter value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * keep the counter running. - */ -void perf_counter_task_sched_in(struct task_struct *task, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - - if (likely(!ctx)) - return; - if (cpuctx->task_ctx == ctx) - return; - __perf_counter_sched_in(ctx, cpuctx, cpu); - cpuctx->task_ctx = ctx; -} - -static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) -{ - struct perf_counter_context *ctx = &cpuctx->ctx; - - __perf_counter_sched_in(ctx, cpuctx, cpu); -} - -#define MAX_INTERRUPTS (~0ULL) - -static void perf_log_throttle(struct perf_counter *counter, int enable); - -static void perf_adjust_period(struct perf_counter *counter, u64 events) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 period, sample_period; - s64 delta; - - events *= hwc->sample_period; - period = div64_u64(events, counter->attr.sample_freq); - - delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ - - sample_period = hwc->sample_period + delta; - - if (!sample_period) - sample_period = 1; - - hwc->sample_period = sample_period; -} - -static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - struct hw_perf_counter *hwc; - u64 interrupts, freq; - - spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - continue; - - hwc = &counter->hw; - - interrupts = hwc->interrupts; - hwc->interrupts = 0; - - /* - * unthrottle counters on the tick - */ - if (interrupts == MAX_INTERRUPTS) { - perf_log_throttle(counter, 1); - counter->pmu->unthrottle(counter); - interrupts = 2*sysctl_perf_counter_sample_rate/HZ; - } - - if (!counter->attr.freq || !counter->attr.sample_freq) - continue; - - /* - * if the specified freq < HZ then we need to skip ticks - */ - if (counter->attr.sample_freq < HZ) { - freq = counter->attr.sample_freq; - - hwc->freq_count += freq; - hwc->freq_interrupts += interrupts; - - if (hwc->freq_count < HZ) - continue; - - interrupts = hwc->freq_interrupts; - hwc->freq_interrupts = 0; - hwc->freq_count -= HZ; - } else - freq = HZ; - - perf_adjust_period(counter, freq * interrupts); - - /* - * In order to avoid being stalled by an (accidental) huge - * sample period, force reset the sample period if we didn't - * get any events in this freq period. - */ - if (!interrupts) { - perf_disable(); - counter->pmu->disable(counter); - atomic64_set(&hwc->period_left, 0); - counter->pmu->enable(counter); - perf_enable(); - } - } - spin_unlock(&ctx->lock); -} - -/* - * Round-robin a context's counters: - */ -static void rotate_ctx(struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - - if (!ctx->nr_counters) - return; - - spin_lock(&ctx->lock); - /* - * Rotate the first entry last (works just fine for group counters too): - */ - perf_disable(); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - list_move_tail(&counter->list_entry, &ctx->counter_list); - break; - } - perf_enable(); - - spin_unlock(&ctx->lock); -} - -void perf_counter_task_tick(struct task_struct *curr, int cpu) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - - if (!atomic_read(&nr_counters)) - return; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = curr->perf_counter_ctxp; - - perf_ctx_adjust_freq(&cpuctx->ctx); - if (ctx) - perf_ctx_adjust_freq(ctx); - - perf_counter_cpu_sched_out(cpuctx); - if (ctx) - __perf_counter_task_sched_out(ctx); - - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); - - perf_counter_cpu_sched_in(cpuctx, cpu); - if (ctx) - perf_counter_task_sched_in(curr, cpu); -} - -/* - * Enable all of a task's counters that have been marked enable-on-exec. - * This expects task == current. - */ -static void perf_counter_enable_on_exec(struct task_struct *task) -{ - struct perf_counter_context *ctx; - struct perf_counter *counter; - unsigned long flags; - int enabled = 0; - - local_irq_save(flags); - ctx = task->perf_counter_ctxp; - if (!ctx || !ctx->nr_counters) - goto out; - - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (!counter->attr.enable_on_exec) - continue; - counter->attr.enable_on_exec = 0; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - continue; - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = - ctx->time - counter->total_time_enabled; - enabled = 1; - } - - /* - * Unclone this context if we enabled any counter. - */ - if (enabled) - unclone_ctx(ctx); - - spin_unlock(&ctx->lock); - - perf_counter_task_sched_in(task, smp_processor_id()); - out: - local_irq_restore(flags); -} - -/* - * Cross CPU call to read the hardware counter - */ -static void __perf_counter_read(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - unsigned long flags; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. In that case - * counter->count would have been updated to a recent sample - * when the counter was scheduled out. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - local_irq_save(flags); - if (ctx->is_active) - update_context_time(ctx); - counter->pmu->read(counter); - update_counter_times(counter); - local_irq_restore(flags); -} - -static u64 perf_counter_read(struct perf_counter *counter) -{ - /* - * If counter is enabled and currently active on a CPU, update the - * value in the counter structure: - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - smp_call_function_single(counter->oncpu, - __perf_counter_read, counter, 1); - } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_counter_times(counter); - } - - return atomic64_read(&counter->count); -} - -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->counter_list); - INIT_LIST_HEAD(&ctx->event_list); - atomic_set(&ctx->refcount, 1); - ctx->task = task; -} - -static struct perf_counter_context *find_get_context(pid_t pid, int cpu) -{ - struct perf_counter_context *ctx; - struct perf_cpu_context *cpuctx; - struct task_struct *task; - unsigned long flags; - int err; - - /* - * If cpu is not a wildcard then this is a percpu counter: - */ - if (cpu != -1) { - /* Must be root to operate on a CPU counter: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); - - if (cpu < 0 || cpu > num_possible_cpus()) - return ERR_PTR(-EINVAL); - - /* - * We could be clever and allow to attach a counter to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_isset(cpu, cpu_online_map)) - return ERR_PTR(-ENODEV); - - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &cpuctx->ctx; - get_ctx(ctx); - - return ctx; - } - - rcu_read_lock(); - if (!pid) - task = current; - else - task = find_task_by_vpid(pid); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - if (!task) - return ERR_PTR(-ESRCH); - - /* - * Can't attach counters to a dying task. - */ - err = -ESRCH; - if (task->flags & PF_EXITING) - goto errout; - - /* Reuse ptrace permission checks for now. */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto errout; - - retry: - ctx = perf_lock_task_context(task, &flags); - if (ctx) { - unclone_ctx(ctx); - spin_unlock_irqrestore(&ctx->lock, flags); - } - - if (!ctx) { - ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - err = -ENOMEM; - if (!ctx) - goto errout; - __perf_counter_init_context(ctx, task); - get_ctx(ctx); - if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { - /* - * We raced with some other task; use - * the context they set. - */ - kfree(ctx); - goto retry; - } - get_task_struct(task); - } - - put_task_struct(task); - return ctx; - - errout: - put_task_struct(task); - return ERR_PTR(err); -} - -static void free_counter_rcu(struct rcu_head *head) -{ - struct perf_counter *counter; - - counter = container_of(head, struct perf_counter, rcu_head); - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); -} - -static void perf_pending_sync(struct perf_counter *counter); - -static void free_counter(struct perf_counter *counter) -{ - perf_pending_sync(counter); - - if (!counter->parent) { - atomic_dec(&nr_counters); - if (counter->attr.mmap) - atomic_dec(&nr_mmap_counters); - if (counter->attr.comm) - atomic_dec(&nr_comm_counters); - if (counter->attr.task) - atomic_dec(&nr_task_counters); - } - - if (counter->destroy) - counter->destroy(counter); - - put_ctx(counter->ctx); - call_rcu(&counter->rcu_head, free_counter_rcu); -} - -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) -{ - struct perf_counter *counter = file->private_data; - struct perf_counter_context *ctx = counter->ctx; - - file->private_data = NULL; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_counter_remove_from_context(counter); - mutex_unlock(&ctx->mutex); - - mutex_lock(&counter->owner->perf_counter_mutex); - list_del_init(&counter->owner_entry); - mutex_unlock(&counter->owner->perf_counter_mutex); - put_task_struct(counter->owner); - - free_counter(counter); - - return 0; -} - -static int perf_counter_read_size(struct perf_counter *counter) -{ - int entry = sizeof(u64); /* value */ - int size = 0; - int nr = 1; - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - size += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - size += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_ID) - entry += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_GROUP) { - nr += counter->group_leader->nr_siblings; - size += sizeof(u64); - } - - size += entry * nr; - - return size; -} - -static u64 perf_counter_read_value(struct perf_counter *counter) -{ - struct perf_counter *child; - u64 total = 0; - - total += perf_counter_read(counter); - list_for_each_entry(child, &counter->child_list, child_list) - total += perf_counter_read(child); - - return total; -} - -static int perf_counter_read_entry(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - int n = 0, count = 0; - u64 values[2]; - - values[n++] = perf_counter_read_value(counter); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; -} - -static int perf_counter_read_group(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - struct perf_counter *leader = counter->group_leader, *sub; - int n = 0, size = 0, err = -EFAULT; - u64 values[3]; - - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = leader->total_time_enabled + - atomic64_read(&leader->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = leader->total_time_running + - atomic64_read(&leader->child_total_time_running); - } - - size = n * sizeof(u64); - - if (copy_to_user(buf, values, size)) - return -EFAULT; - - err = perf_counter_read_entry(leader, read_format, buf + size); - if (err < 0) - return err; - - size += err; - - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - err = perf_counter_read_entry(sub, read_format, - buf + size); - if (err < 0) - return err; - - size += err; - } - - return size; -} - -static int perf_counter_read_one(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - u64 values[4]; - int n = 0; - - values[n++] = perf_counter_read_value(counter); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - if (copy_to_user(buf, values, n * sizeof(u64))) - return -EFAULT; - - return n * sizeof(u64); -} - -/* - * Read the performance counter - simple non blocking version for now - */ -static ssize_t -perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) -{ - u64 read_format = counter->attr.read_format; - int ret; - - /* - * Return end-of-file for a read on a counter that is in - * error state (i.e. because it was pinned but it couldn't be - * scheduled on to the CPU at some point). - */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - return 0; - - if (count < perf_counter_read_size(counter)) - return -ENOSPC; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - if (read_format & PERF_FORMAT_GROUP) - ret = perf_counter_read_group(counter, read_format, buf); - else - ret = perf_counter_read_one(counter, read_format, buf); - mutex_unlock(&counter->child_mutex); - - return ret; -} - -static ssize_t -perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - struct perf_counter *counter = file->private_data; - - return perf_read_hw(counter, buf, count); -} - -static unsigned int perf_poll(struct file *file, poll_table *wait) -{ - struct perf_counter *counter = file->private_data; - struct perf_mmap_data *data; - unsigned int events = POLL_HUP; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) - events = atomic_xchg(&data->poll, 0); - rcu_read_unlock(); - - poll_wait(file, &counter->waitq, wait); - - return events; -} - -static void perf_counter_reset(struct perf_counter *counter) -{ - (void)perf_counter_read(counter); - atomic64_set(&counter->count, 0); - perf_counter_update_userpage(counter); -} - -/* - * Holding the top-level counter's child_mutex means that any - * descendant process that has inherited this counter will block - * in sync_child_counter if it goes to exit, thus satisfying the - * task existence requirements of perf_counter_enable/disable. - */ -static void perf_counter_for_each_child(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter *child; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - func(counter); - list_for_each_entry(child, &counter->child_list, child_list) - func(child); - mutex_unlock(&counter->child_mutex); -} - -static void perf_counter_for_each(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *sibling; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - counter = counter->group_leader; - - perf_counter_for_each_child(counter, func); - func(counter); - list_for_each_entry(sibling, &counter->sibling_list, list_entry) - perf_counter_for_each_child(counter, func); - mutex_unlock(&ctx->mutex); -} - -static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) -{ - struct perf_counter_context *ctx = counter->ctx; - unsigned long size; - int ret = 0; - u64 value; - - if (!counter->attr.sample_period) - return -EINVAL; - - size = copy_from_user(&value, arg, sizeof(value)); - if (size != sizeof(value)) - return -EFAULT; - - if (!value) - return -EINVAL; - - spin_lock_irq(&ctx->lock); - if (counter->attr.freq) { - if (value > sysctl_perf_counter_sample_rate) { - ret = -EINVAL; - goto unlock; - } - - counter->attr.sample_freq = value; - } else { - counter->attr.sample_period = value; - counter->hw.sample_period = value; - } -unlock: - spin_unlock_irq(&ctx->lock); - - return ret; -} - -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct perf_counter *counter = file->private_data; - void (*func)(struct perf_counter *); - u32 flags = arg; - - switch (cmd) { - case PERF_COUNTER_IOC_ENABLE: - func = perf_counter_enable; - break; - case PERF_COUNTER_IOC_DISABLE: - func = perf_counter_disable; - break; - case PERF_COUNTER_IOC_RESET: - func = perf_counter_reset; - break; - - case PERF_COUNTER_IOC_REFRESH: - return perf_counter_refresh(counter, arg); - - case PERF_COUNTER_IOC_PERIOD: - return perf_counter_period(counter, (u64 __user *)arg); - - default: - return -ENOTTY; - } - - if (flags & PERF_IOC_FLAG_GROUP) - perf_counter_for_each(counter, func); - else - perf_counter_for_each_child(counter, func); - - return 0; -} - -int perf_counter_task_enable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_enable); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -int perf_counter_task_disable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_disable); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -#ifndef PERF_COUNTER_INDEX_OFFSET -# define PERF_COUNTER_INDEX_OFFSET 0 -#endif - -static int perf_counter_index(struct perf_counter *counter) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return 0; - - return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET; -} - -/* - * Callers need to ensure there can be no nesting of this function, otherwise - * the seqlock logic goes bad. We can not serialize this because the arch - * code calls this from NMI context. - */ -void perf_counter_update_userpage(struct perf_counter *counter) -{ - struct perf_counter_mmap_page *userpg; - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto unlock; - - userpg = data->user_page; - - /* - * Disable preemption so as to not let the corresponding user-space - * spin too long if we get preempted. - */ - preempt_disable(); - ++userpg->lock; - barrier(); - userpg->index = perf_counter_index(counter); - userpg->offset = atomic64_read(&counter->count); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - userpg->offset -= atomic64_read(&counter->hw.prev_count); - - userpg->time_enabled = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - - userpg->time_running = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - - barrier(); - ++userpg->lock; - preempt_enable(); -unlock: - rcu_read_unlock(); -} - -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct perf_counter *counter = vma->vm_file->private_data; - struct perf_mmap_data *data; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto unlock; - - if (vmf->pgoff == 0) { - vmf->page = virt_to_page(data->user_page); - } else { - int nr = vmf->pgoff - 1; - - if ((unsigned)nr > data->nr_pages) - goto unlock; - - if (vmf->flags & FAULT_FLAG_WRITE) - goto unlock; - - vmf->page = virt_to_page(data->data_pages[nr]); - } - - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} - -static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) -{ - struct perf_mmap_data *data; - unsigned long size; - int i; - - WARN_ON(atomic_read(&counter->mmap_count)); - - size = sizeof(struct perf_mmap_data); - size += nr_pages * sizeof(void *); - - data = kzalloc(size, GFP_KERNEL); - if (!data) - goto fail; - - data->user_page = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->user_page) - goto fail_user_page; - - for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->data_pages[i]) - goto fail_data_pages; - } - - data->nr_pages = nr_pages; - atomic_set(&data->lock, -1); - - rcu_assign_pointer(counter->data, data); - - return 0; - -fail_data_pages: - for (i--; i >= 0; i--) - free_page((unsigned long)data->data_pages[i]); - - free_page((unsigned long)data->user_page); - -fail_user_page: - kfree(data); - -fail: - return -ENOMEM; -} - -static void perf_mmap_free_page(unsigned long addr) -{ - struct page *page = virt_to_page((void *)addr); - - page->mapping = NULL; - __free_page(page); -} - -static void __perf_mmap_data_free(struct rcu_head *rcu_head) -{ - struct perf_mmap_data *data; - int i; - - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - - perf_mmap_free_page((unsigned long)data->user_page); - for (i = 0; i < data->nr_pages; i++) - perf_mmap_free_page((unsigned long)data->data_pages[i]); - - kfree(data); -} - -static void perf_mmap_data_free(struct perf_counter *counter) -{ - struct perf_mmap_data *data = counter->data; - - WARN_ON(atomic_read(&counter->mmap_count)); - - rcu_assign_pointer(counter->data, NULL); - call_rcu(&data->rcu_head, __perf_mmap_data_free); -} - -static void perf_mmap_open(struct vm_area_struct *vma) -{ - struct perf_counter *counter = vma->vm_file->private_data; - - atomic_inc(&counter->mmap_count); -} - -static void perf_mmap_close(struct vm_area_struct *vma) -{ - struct perf_counter *counter = vma->vm_file->private_data; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { - struct user_struct *user = current_user(); - - atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= counter->data->nr_locked; - perf_mmap_data_free(counter); - mutex_unlock(&counter->mmap_mutex); - } -} - -static struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, - .page_mkwrite = perf_mmap_fault, -}; - -static int perf_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct perf_counter *counter = file->private_data; - unsigned long user_locked, user_lock_limit; - struct user_struct *user = current_user(); - unsigned long locked, lock_limit; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra, extra; - int ret = 0; - - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; - - vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; - - /* - * If we have data pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - if (vma->vm_pgoff != 0) - return -EINVAL; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->mmap_mutex); - if (atomic_inc_not_zero(&counter->mmap_count)) { - if (nr_pages != counter->data->nr_pages) - ret = -EINVAL; - goto unlock; - } - - user_extra = nr_pages + 1; - user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); - - /* - * Increase the limit linearly with more CPUs: - */ - user_lock_limit *= num_online_cpus(); - - user_locked = atomic_long_read(&user->locked_vm) + user_extra; - - extra = 0; - if (user_locked > user_lock_limit) - extra = user_locked - user_lock_limit; - - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->locked_vm + extra; - - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } - - WARN_ON(counter->data); - ret = perf_mmap_data_alloc(counter, nr_pages); - if (ret) - goto unlock; - - atomic_set(&counter->mmap_count, 1); - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->locked_vm += extra; - counter->data->nr_locked = extra; - if (vma->vm_flags & VM_WRITE) - counter->data->writable = 1; - -unlock: - mutex_unlock(&counter->mmap_mutex); - - vma->vm_flags |= VM_RESERVED; - vma->vm_ops = &perf_mmap_vmops; - - return ret; -} - -static int perf_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct perf_counter *counter = filp->private_data; - int retval; - - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &counter->fasync); - mutex_unlock(&inode->i_mutex); - - if (retval < 0) - return retval; - - return 0; -} - -static const struct file_operations perf_fops = { - .release = perf_release, - .read = perf_read, - .poll = perf_poll, - .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, - .mmap = perf_mmap, - .fasync = perf_fasync, -}; - -/* - * Perf counter wakeup - * - * If there's data, ensure we set the poll() state and publish everything - * to user-space before waking everybody up. - */ - -void perf_counter_wakeup(struct perf_counter *counter) -{ - wake_up_all(&counter->waitq); - - if (counter->pending_kill) { - kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); - counter->pending_kill = 0; - } -} - -/* - * Pending wakeups - * - * Handle the case where we need to wakeup up from NMI (or rq->lock) context. - * - * The NMI bit means we cannot possibly take locks. Therefore, maintain a - * single linked list and use cmpxchg() to add entries lockless. - */ - -static void perf_pending_counter(struct perf_pending_entry *entry) -{ - struct perf_counter *counter = container_of(entry, - struct perf_counter, pending); - - if (counter->pending_disable) { - counter->pending_disable = 0; - __perf_counter_disable(counter); - } - - if (counter->pending_wakeup) { - counter->pending_wakeup = 0; - perf_counter_wakeup(counter); - } -} - -#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) - -static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { - PENDING_TAIL, -}; - -static void perf_pending_queue(struct perf_pending_entry *entry, - void (*func)(struct perf_pending_entry *)) -{ - struct perf_pending_entry **head; - - if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) - return; - - entry->func = func; - - head = &get_cpu_var(perf_pending_head); - - do { - entry->next = *head; - } while (cmpxchg(head, entry->next, entry) != entry->next); - - set_perf_counter_pending(); - - put_cpu_var(perf_pending_head); -} - -static int __perf_pending_run(void) -{ - struct perf_pending_entry *list; - int nr = 0; - - list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); - while (list != PENDING_TAIL) { - void (*func)(struct perf_pending_entry *); - struct perf_pending_entry *entry = list; - - list = list->next; - - func = entry->func; - entry->next = NULL; - /* - * Ensure we observe the unqueue before we issue the wakeup, - * so that we won't be waiting forever. - * -- see perf_not_pending(). - */ - smp_wmb(); - - func(entry); - nr++; - } - - return nr; -} - -static inline int perf_not_pending(struct perf_counter *counter) -{ - /* - * If we flush on whatever cpu we run, there is a chance we don't - * need to wait. - */ - get_cpu(); - __perf_pending_run(); - put_cpu(); - - /* - * Ensure we see the proper queue state before going to sleep - * so that we do not miss the wakeup. -- see perf_pending_handle() - */ - smp_rmb(); - return counter->pending.next == NULL; -} - -static void perf_pending_sync(struct perf_counter *counter) -{ - wait_event(counter->waitq, perf_not_pending(counter)); -} - -void perf_counter_do_pending(void) -{ - __perf_pending_run(); -} - -/* - * Callchain support -- arch specific - */ - -__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - return NULL; -} - -/* - * Output - */ - -struct perf_output_handle { - struct perf_counter *counter; - struct perf_mmap_data *data; - unsigned long head; - unsigned long offset; - int nmi; - int sample; - int locked; - unsigned long flags; -}; - -static bool perf_output_space(struct perf_mmap_data *data, - unsigned int offset, unsigned int head) -{ - unsigned long tail; - unsigned long mask; - - if (!data->writable) - return true; - - mask = (data->nr_pages << PAGE_SHIFT) - 1; - /* - * Userspace could choose to issue a mb() before updating the tail - * pointer. So that all reads will be completed before the write is - * issued. - */ - tail = ACCESS_ONCE(data->user_page->data_tail); - smp_rmb(); - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - -static void perf_output_wakeup(struct perf_output_handle *handle) -{ - atomic_set(&handle->data->poll, POLL_IN); - - if (handle->nmi) { - handle->counter->pending_wakeup = 1; - perf_pending_queue(&handle->counter->pending, - perf_pending_counter); - } else - perf_counter_wakeup(handle->counter); -} - -/* - * Curious locking construct. - * - * We need to ensure a later event doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we - * cannot fully serialize things. - * - * What we do is serialize between CPUs so we only have to deal with NMI - * nesting on a single CPU. - * - * We only publish the head (and generate a wakeup) when the outer-most - * event completes. - */ -static void perf_output_lock(struct perf_output_handle *handle) -{ - struct perf_mmap_data *data = handle->data; - int cpu; - - handle->locked = 0; - - local_irq_save(handle->flags); - cpu = smp_processor_id(); - - if (in_nmi() && atomic_read(&data->lock) == cpu) - return; - - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); - - handle->locked = 1; -} - -static void perf_output_unlock(struct perf_output_handle *handle) -{ - struct perf_mmap_data *data = handle->data; - unsigned long head; - int cpu; - - data->done_head = data->head; - - if (!handle->locked) - goto out; - -again: - /* - * The xchg implies a full barrier that ensures all writes are done - * before we publish the new head, matched by a rmb() in userspace when - * reading this position. - */ - while ((head = atomic_long_xchg(&data->done_head, 0))) - data->user_page->data_head = head; - - /* - * NMI can happen here, which means we can miss a done_head update. - */ - - cpu = atomic_xchg(&data->lock, -1); - WARN_ON_ONCE(cpu != smp_processor_id()); - - /* - * Therefore we have to validate we did not indeed do so. - */ - if (unlikely(atomic_long_read(&data->done_head))) { - /* - * Since we had it locked, we can lock it again. - */ - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); - - goto again; - } - - if (atomic_xchg(&data->wakeup, 0)) - perf_output_wakeup(handle); -out: - local_irq_restore(handle->flags); -} - -static void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - unsigned int pages_mask; - unsigned int offset; - unsigned int size; - void **pages; - - offset = handle->offset; - pages_mask = handle->data->nr_pages - 1; - pages = handle->data->data_pages; - - do { - unsigned int page_offset; - int nr; - - nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); - - memcpy(pages[nr] + page_offset, buf, size); - - len -= size; - buf += size; - offset += size; - } while (len); - - handle->offset = offset; - - /* - * Check we didn't copy past our reservation window, taking the - * possible unsigned int wrap into account. - */ - WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); -} - -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) - -static int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size, - int nmi, int sample) -{ - struct perf_mmap_data *data; - unsigned int offset, head; - int have_lost; - struct { - struct perf_event_header header; - u64 id; - u64 lost; - } lost_event; - - /* - * For inherited counters we send all the output towards the parent. - */ - if (counter->parent) - counter = counter->parent; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto out; - - handle->data = data; - handle->counter = counter; - handle->nmi = nmi; - handle->sample = sample; - - if (!data->nr_pages) - goto fail; - - have_lost = atomic_read(&data->lost); - if (have_lost) - size += sizeof(lost_event); - - perf_output_lock(handle); - - do { - offset = head = atomic_long_read(&data->head); - head += size; - if (unlikely(!perf_output_space(data, offset, head))) - goto fail; - } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); - - handle->offset = offset; - handle->head = head; - - if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) - atomic_set(&data->wakeup, 1); - - if (have_lost) { - lost_event.header.type = PERF_EVENT_LOST; - lost_event.header.misc = 0; - lost_event.header.size = sizeof(lost_event); - lost_event.id = counter->id; - lost_event.lost = atomic_xchg(&data->lost, 0); - - perf_output_put(handle, lost_event); - } - - return 0; - -fail: - atomic_inc(&data->lost); - perf_output_unlock(handle); -out: - rcu_read_unlock(); - - return -ENOSPC; -} - -static void perf_output_end(struct perf_output_handle *handle) -{ - struct perf_counter *counter = handle->counter; - struct perf_mmap_data *data = handle->data; - - int wakeup_events = counter->attr.wakeup_events; - - if (handle->sample && wakeup_events) { - int events = atomic_inc_return(&data->events); - if (events >= wakeup_events) { - atomic_sub(wakeup_events, &data->events); - atomic_set(&data->wakeup, 1); - } - } - - perf_output_unlock(handle); - rcu_read_unlock(); -} - -static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) -{ - /* - * only top level counters have the pid namespace they were created in - */ - if (counter->parent) - counter = counter->parent; - - return task_tgid_nr_ns(p, counter->ns); -} - -static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) -{ - /* - * only top level counters have the pid namespace they were created in - */ - if (counter->parent) - counter = counter->parent; - - return task_pid_nr_ns(p, counter->ns); -} - -static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - u64 read_format = counter->attr.read_format; - u64 values[4]; - int n = 0; - - values[n++] = atomic64_read(&counter->count); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - perf_output_copy(handle, values, n * sizeof(u64)); -} - -/* - * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. - */ -static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - struct perf_counter *leader = counter->group_leader, *sub; - u64 read_format = counter->attr.read_format; - u64 values[5]; - int n = 0; - - values[n++] = 1 + leader->nr_siblings; - - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = leader->total_time_enabled; - - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = leader->total_time_running; - - if (leader != counter) - leader->pmu->read(leader); - - values[n++] = atomic64_read(&leader->count); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(leader); - - perf_output_copy(handle, values, n * sizeof(u64)); - - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - n = 0; - - if (sub != counter) - sub->pmu->read(sub); - - values[n++] = atomic64_read(&sub->count); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(sub); - - perf_output_copy(handle, values, n * sizeof(u64)); - } -} - -static void perf_output_read(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - if (counter->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, counter); - else - perf_output_read_one(handle, counter); -} - -void perf_counter_output(struct perf_counter *counter, int nmi, - struct perf_sample_data *data) -{ - int ret; - u64 sample_type = counter->attr.sample_type; - struct perf_output_handle handle; - struct perf_event_header header; - u64 ip; - struct { - u32 pid, tid; - } tid_entry; - struct perf_callchain_entry *callchain = NULL; - int callchain_size = 0; - u64 time; - struct { - u32 cpu, reserved; - } cpu_entry; - - header.type = PERF_EVENT_SAMPLE; - header.size = sizeof(header); - - header.misc = 0; - header.misc |= perf_misc_flags(data->regs); - - if (sample_type & PERF_SAMPLE_IP) { - ip = perf_instruction_pointer(data->regs); - header.size += sizeof(ip); - } - - if (sample_type & PERF_SAMPLE_TID) { - /* namespace issues */ - tid_entry.pid = perf_counter_pid(counter, current); - tid_entry.tid = perf_counter_tid(counter, current); - - header.size += sizeof(tid_entry); - } - - if (sample_type & PERF_SAMPLE_TIME) { - /* - * Maybe do better on x86 and provide cpu_clock_nmi() - */ - time = sched_clock(); - - header.size += sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_ADDR) - header.size += sizeof(u64); - - if (sample_type & PERF_SAMPLE_ID) - header.size += sizeof(u64); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - header.size += sizeof(u64); - - if (sample_type & PERF_SAMPLE_CPU) { - header.size += sizeof(cpu_entry); - - cpu_entry.cpu = raw_smp_processor_id(); - cpu_entry.reserved = 0; - } - - if (sample_type & PERF_SAMPLE_PERIOD) - header.size += sizeof(u64); - - if (sample_type & PERF_SAMPLE_READ) - header.size += perf_counter_read_size(counter); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - callchain = perf_callchain(data->regs); - - if (callchain) { - callchain_size = (1 + callchain->nr) * sizeof(u64); - header.size += callchain_size; - } else - header.size += sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); - - WARN_ON_ONCE(size & (sizeof(u64)-1)); - header.size += size; - } - - ret = perf_output_begin(&handle, counter, header.size, nmi, 1); - if (ret) - return; - - perf_output_put(&handle, header); - - if (sample_type & PERF_SAMPLE_IP) - perf_output_put(&handle, ip); - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(&handle, tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(&handle, time); - - if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(&handle, data->addr); - - if (sample_type & PERF_SAMPLE_ID) { - u64 id = primary_counter_id(counter); - - perf_output_put(&handle, id); - } - - if (sample_type & PERF_SAMPLE_STREAM_ID) - perf_output_put(&handle, counter->id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(&handle, cpu_entry); - - if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(&handle, data->period); - - if (sample_type & PERF_SAMPLE_READ) - perf_output_read(&handle, counter); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (callchain) - perf_output_copy(&handle, callchain, callchain_size); - else { - u64 nr = 0; - perf_output_put(&handle, nr); - } - } - - if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - perf_output_put(&handle, data->raw->size); - perf_output_copy(&handle, data->raw->data, data->raw->size); - } else { - struct { - u32 size; - u32 data; - } raw = { - .size = sizeof(u32), - .data = 0, - }; - perf_output_put(&handle, raw); - } - } - - perf_output_end(&handle); -} - -/* - * read event - */ - -struct perf_read_event { - struct perf_event_header header; - - u32 pid; - u32 tid; -}; - -static void -perf_counter_read_event(struct perf_counter *counter, - struct task_struct *task) -{ - struct perf_output_handle handle; - struct perf_read_event event = { - .header = { - .type = PERF_EVENT_READ, - .misc = 0, - .size = sizeof(event) + perf_counter_read_size(counter), - }, - .pid = perf_counter_pid(counter, task), - .tid = perf_counter_tid(counter, task), - }; - int ret; - - ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); - if (ret) - return; - - perf_output_put(&handle, event); - perf_output_read(&handle, counter); - - perf_output_end(&handle); -} - -/* - * task tracking -- fork/exit - * - * enabled by: attr.comm | attr.mmap | attr.task - */ - -struct perf_task_event { - struct task_struct *task; - struct perf_counter_context *task_ctx; - - struct { - struct perf_event_header header; - - u32 pid; - u32 ppid; - u32 tid; - u32 ptid; - } event; -}; - -static void perf_counter_task_output(struct perf_counter *counter, - struct perf_task_event *task_event) -{ - struct perf_output_handle handle; - int size = task_event->event.header.size; - struct task_struct *task = task_event->task; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - task_event->event.pid = perf_counter_pid(counter, task); - task_event->event.ppid = perf_counter_pid(counter, current); - - task_event->event.tid = perf_counter_tid(counter, task); - task_event->event.ptid = perf_counter_tid(counter, current); - - perf_output_put(&handle, task_event->event); - perf_output_end(&handle); -} - -static int perf_counter_task_match(struct perf_counter *counter) -{ - if (counter->attr.comm || counter->attr.mmap || counter->attr.task) - return 1; - - return 0; -} - -static void perf_counter_task_ctx(struct perf_counter_context *ctx, - struct perf_task_event *task_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_task_match(counter)) - perf_counter_task_output(counter, task_event); - } - rcu_read_unlock(); -} - -static void perf_counter_task_event(struct perf_task_event *task_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx = task_event->task_ctx; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_task_ctx(&cpuctx->ctx, task_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - if (!ctx) - ctx = rcu_dereference(task_event->task->perf_counter_ctxp); - if (ctx) - perf_counter_task_ctx(ctx, task_event); - rcu_read_unlock(); -} - -static void perf_counter_task(struct task_struct *task, - struct perf_counter_context *task_ctx, - int new) -{ - struct perf_task_event task_event; - - if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters) && - !atomic_read(&nr_task_counters)) - return; - - task_event = (struct perf_task_event){ - .task = task, - .task_ctx = task_ctx, - .event = { - .header = { - .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, - .misc = 0, - .size = sizeof(task_event.event), - }, - /* .pid */ - /* .ppid */ - /* .tid */ - /* .ptid */ - }, - }; - - perf_counter_task_event(&task_event); -} - -void perf_counter_fork(struct task_struct *task) -{ - perf_counter_task(task, NULL, 1); -} - -/* - * comm tracking - */ - -struct perf_comm_event { - struct task_struct *task; - char *comm; - int comm_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - } event; -}; - -static void perf_counter_comm_output(struct perf_counter *counter, - struct perf_comm_event *comm_event) -{ - struct perf_output_handle handle; - int size = comm_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - comm_event->event.pid = perf_counter_pid(counter, comm_event->task); - comm_event->event.tid = perf_counter_tid(counter, comm_event->task); - - perf_output_put(&handle, comm_event->event); - perf_output_copy(&handle, comm_event->comm, - comm_event->comm_size); - perf_output_end(&handle); -} - -static int perf_counter_comm_match(struct perf_counter *counter) -{ - if (counter->attr.comm) - return 1; - - return 0; -} - -static void perf_counter_comm_ctx(struct perf_counter_context *ctx, - struct perf_comm_event *comm_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_comm_match(counter)) - perf_counter_comm_output(counter, comm_event); - } - rcu_read_unlock(); -} - -static void perf_counter_comm_event(struct perf_comm_event *comm_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - unsigned int size; - char comm[TASK_COMM_LEN]; - - memset(comm, 0, sizeof(comm)); - strncpy(comm, comm_event->task->comm, sizeof(comm)); - size = ALIGN(strlen(comm)+1, sizeof(u64)); - - comm_event->comm = comm; - comm_event->comm_size = size; - - comm_event->event.header.size = sizeof(comm_event->event) + size; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(&cpuctx->ctx, comm_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_counter_comm_ctx(ctx, comm_event); - rcu_read_unlock(); -} - -void perf_counter_comm(struct task_struct *task) -{ - struct perf_comm_event comm_event; - - if (task->perf_counter_ctxp) - perf_counter_enable_on_exec(task); - - if (!atomic_read(&nr_comm_counters)) - return; - - comm_event = (struct perf_comm_event){ - .task = task, - /* .comm */ - /* .comm_size */ - .event = { - .header = { - .type = PERF_EVENT_COMM, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - }, - }; - - perf_counter_comm_event(&comm_event); -} - -/* - * mmap tracking - */ - -struct perf_mmap_event { - struct vm_area_struct *vma; - - const char *file_name; - int file_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - u64 start; - u64 len; - u64 pgoff; - } event; -}; - -static void perf_counter_mmap_output(struct perf_counter *counter, - struct perf_mmap_event *mmap_event) -{ - struct perf_output_handle handle; - int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - mmap_event->event.pid = perf_counter_pid(counter, current); - mmap_event->event.tid = perf_counter_tid(counter, current); - - perf_output_put(&handle, mmap_event->event); - perf_output_copy(&handle, mmap_event->file_name, - mmap_event->file_size); - perf_output_end(&handle); -} - -static int perf_counter_mmap_match(struct perf_counter *counter, - struct perf_mmap_event *mmap_event) -{ - if (counter->attr.mmap) - return 1; - - return 0; -} - -static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, - struct perf_mmap_event *mmap_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_mmap_match(counter, mmap_event)) - perf_counter_mmap_output(counter, mmap_event); - } - rcu_read_unlock(); -} - -static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - struct vm_area_struct *vma = mmap_event->vma; - struct file *file = vma->vm_file; - unsigned int size; - char tmp[16]; - char *buf = NULL; - const char *name; - - memset(tmp, 0, sizeof(tmp)); - - if (file) { - /* - * d_path works from the end of the buffer backwards, so we - * need to add enough zero bytes after the string to handle - * the 64bit alignment we do later. - */ - buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); - if (!buf) { - name = strncpy(tmp, "//enomem", sizeof(tmp)); - goto got_name; - } - name = d_path(&file->f_path, buf, PATH_MAX); - if (IS_ERR(name)) { - name = strncpy(tmp, "//toolong", sizeof(tmp)); - goto got_name; - } - } else { - if (arch_vma_name(mmap_event->vma)) { - name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); - goto got_name; - } - - if (!vma->vm_mm) { - name = strncpy(tmp, "[vdso]", sizeof(tmp)); - goto got_name; - } - - name = strncpy(tmp, "//anon", sizeof(tmp)); - goto got_name; - } - -got_name: - size = ALIGN(strlen(name)+1, sizeof(u64)); - - mmap_event->file_name = name; - mmap_event->file_size = size; - - mmap_event->event.header.size = sizeof(mmap_event->event) + size; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_counter_mmap_ctx(ctx, mmap_event); - rcu_read_unlock(); - - kfree(buf); -} - -void __perf_counter_mmap(struct vm_area_struct *vma) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_mmap_counters)) - return; - - mmap_event = (struct perf_mmap_event){ - .vma = vma, - /* .file_name */ - /* .file_size */ - .event = { - .header = { - .type = PERF_EVENT_MMAP, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - .start = vma->vm_start, - .len = vma->vm_end - vma->vm_start, - .pgoff = vma->vm_pgoff, - }, - }; - - perf_counter_mmap_event(&mmap_event); -} - -/* - * IRQ throttle logging - */ - -static void perf_log_throttle(struct perf_counter *counter, int enable) -{ - struct perf_output_handle handle; - int ret; - - struct { - struct perf_event_header header; - u64 time; - u64 id; - u64 stream_id; - } throttle_event = { - .header = { - .type = PERF_EVENT_THROTTLE, - .misc = 0, - .size = sizeof(throttle_event), - }, - .time = sched_clock(), - .id = primary_counter_id(counter), - .stream_id = counter->id, - }; - - if (enable) - throttle_event.header.type = PERF_EVENT_UNTHROTTLE; - - ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); - if (ret) - return; - - perf_output_put(&handle, throttle_event); - perf_output_end(&handle); -} - -/* - * Generic counter overflow handling, sampling. - */ - -int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data) -{ - int events = atomic_read(&counter->event_limit); - int throttle = counter->pmu->unthrottle != NULL; - struct hw_perf_counter *hwc = &counter->hw; - int ret = 0; - - if (!throttle) { - hwc->interrupts++; - } else { - if (hwc->interrupts != MAX_INTERRUPTS) { - hwc->interrupts++; - if (HZ * hwc->interrupts > - (u64)sysctl_perf_counter_sample_rate) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(counter, 0); - ret = 1; - } - } else { - /* - * Keep re-disabling counters even though on the previous - * pass we disabled it - just in case we raced with a - * sched-in and the counter got enabled again: - */ - ret = 1; - } - } - - if (counter->attr.freq) { - u64 now = sched_clock(); - s64 delta = now - hwc->freq_stamp; - - hwc->freq_stamp = now; - - if (delta > 0 && delta < TICK_NSEC) - perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); - } - - /* - * XXX event_limit might not quite work as expected on inherited - * counters - */ - - counter->pending_kill = POLL_IN; - if (events && atomic_dec_and_test(&counter->event_limit)) { - ret = 1; - counter->pending_kill = POLL_HUP; - if (nmi) { - counter->pending_disable = 1; - perf_pending_queue(&counter->pending, - perf_pending_counter); - } else - perf_counter_disable(counter); - } - - perf_counter_output(counter, nmi, data); - return ret; -} - -/* - * Generic software counter infrastructure - */ - -/* - * We directly increment counter->count and keep a second value in - * counter->hw.period_left to count intervals. This period counter - * is kept in the range [-sample_period, 0] so that we can use the - * sign as trigger. - */ - -static u64 perf_swcounter_set_period(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 period = hwc->last_period; - u64 nr, offset; - s64 old, val; - - hwc->last_period = hwc->sample_period; - -again: - old = val = atomic64_read(&hwc->period_left); - if (val < 0) - return 0; - - nr = div64_u64(period + val, period); - offset = nr * period; - val -= offset; - if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) - goto again; - - return nr; -} - -static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct perf_sample_data *data) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 overflow; - - data->period = counter->hw.last_period; - overflow = perf_swcounter_set_period(counter); - - if (hwc->interrupts == MAX_INTERRUPTS) - return; - - for (; overflow; overflow--) { - if (perf_counter_overflow(counter, nmi, data)) { - /* - * We inhibit the overflow from happening when - * hwc->interrupts == MAX_INTERRUPTS. - */ - break; - } - } -} - -static void perf_swcounter_unthrottle(struct perf_counter *counter) -{ - /* - * Nothing to do, we already reset hwc->interrupts. - */ -} - -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data) -{ - struct hw_perf_counter *hwc = &counter->hw; - - atomic64_add(nr, &counter->count); - - if (!hwc->sample_period) - return; - - if (!data->regs) - return; - - if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swcounter_overflow(counter, nmi, data); -} - -static int perf_swcounter_is_counting(struct perf_counter *counter) -{ - /* - * The counter is active, we're good! - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - return 1; - - /* - * The counter is off/error, not counting. - */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE) - return 0; - - /* - * The counter is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (counter->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. - */ - return 1; -} - -static int perf_swcounter_match(struct perf_counter *counter, - enum perf_type_id type, - u32 event, struct pt_regs *regs) -{ - if (!perf_swcounter_is_counting(counter)) - return 0; - - if (counter->attr.type != type) - return 0; - if (counter->attr.config != event) - return 0; - - if (regs) { - if (counter->attr.exclude_user && user_mode(regs)) - return 0; - - if (counter->attr.exclude_kernel && !user_mode(regs)) - return 0; - } - - return 1; -} - -static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum perf_type_id type, - u32 event, u64 nr, int nmi, - struct perf_sample_data *data) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event, data->regs)) - perf_swcounter_add(counter, nr, nmi, data); - } - rcu_read_unlock(); -} - -static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) -{ - if (in_nmi()) - return &cpuctx->recursion[3]; - - if (in_irq()) - return &cpuctx->recursion[2]; - - if (in_softirq()) - return &cpuctx->recursion[1]; - - return &cpuctx->recursion[0]; -} - -static void do_perf_swcounter_event(enum perf_type_id type, u32 event, - u64 nr, int nmi, - struct perf_sample_data *data) -{ - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swcounter_recursion_context(cpuctx); - struct perf_counter_context *ctx; - - if (*recursion) - goto out; - - (*recursion)++; - barrier(); - - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, - nr, nmi, data); - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data); - rcu_read_unlock(); - - barrier(); - (*recursion)--; - -out: - put_cpu_var(perf_cpu_context); -} - -void __perf_swcounter_event(u32 event, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) -{ - struct perf_sample_data data = { - .regs = regs, - .addr = addr, - }; - - do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data); -} - -static void perf_swcounter_read(struct perf_counter *counter) -{ -} - -static int perf_swcounter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - - if (hwc->sample_period) { - hwc->last_period = hwc->sample_period; - perf_swcounter_set_period(counter); - } - return 0; -} - -static void perf_swcounter_disable(struct perf_counter *counter) -{ -} - -static const struct pmu perf_ops_generic = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, - .unthrottle = perf_swcounter_unthrottle, -}; - -/* - * hrtimer based swcounter callback - */ - -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct perf_counter *counter; - u64 period; - - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); - - data.addr = 0; - data.regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. - */ - if ((counter->attr.exclude_kernel || !data.regs) && - !counter->attr.exclude_user) - data.regs = task_pt_regs(current); - - if (data.regs) { - if (perf_counter_overflow(counter, 0, &data)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, counter->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; -} - -/* - * Software counter: cpu wall time clock - */ - -static void cpu_clock_perf_counter_update(struct perf_counter *counter) -{ - int cpu = raw_smp_processor_id(); - s64 prev; - u64 now; - - now = cpu_clock(cpu); - prev = atomic64_read(&counter->hw.prev_count); - atomic64_set(&counter->hw.prev_count, now); - atomic64_add(now - prev, &counter->count); -} - -static int cpu_clock_perf_counter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - int cpu = raw_smp_processor_id(); - - atomic64_set(&hwc->prev_count, cpu_clock(cpu)); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } - - return 0; -} - -static void cpu_clock_perf_counter_disable(struct perf_counter *counter) -{ - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - cpu_clock_perf_counter_update(counter); -} - -static void cpu_clock_perf_counter_read(struct perf_counter *counter) -{ - cpu_clock_perf_counter_update(counter); -} - -static const struct pmu perf_ops_cpu_clock = { - .enable = cpu_clock_perf_counter_enable, - .disable = cpu_clock_perf_counter_disable, - .read = cpu_clock_perf_counter_read, -}; - -/* - * Software counter: task time clock - */ - -static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) -{ - u64 prev; - s64 delta; - - prev = atomic64_xchg(&counter->hw.prev_count, now); - delta = now - prev; - atomic64_add(delta, &counter->count); -} - -static int task_clock_perf_counter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 now; - - now = counter->ctx->time; - - atomic64_set(&hwc->prev_count, now); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } - - return 0; -} - -static void task_clock_perf_counter_disable(struct perf_counter *counter) -{ - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - task_clock_perf_counter_update(counter, counter->ctx->time); - -} - -static void task_clock_perf_counter_read(struct perf_counter *counter) -{ - u64 time; - - if (!in_nmi()) { - update_context_time(counter->ctx); - time = counter->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - counter->ctx->timestamp; - time = counter->ctx->time + delta; - } - - task_clock_perf_counter_update(counter, time); -} - -static const struct pmu perf_ops_task_clock = { - .enable = task_clock_perf_counter_enable, - .disable = task_clock_perf_counter_disable, - .read = task_clock_perf_counter_read, -}; - -#ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, - int entry_size) -{ - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - struct perf_sample_data data = { - .regs = get_irq_regs(), - .addr = addr, - .raw = &raw, - }; - - if (!data.regs) - data.regs = task_pt_regs(current); - - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); -} -EXPORT_SYMBOL_GPL(perf_tpcounter_event); - -extern int ftrace_profile_enable(int); -extern void ftrace_profile_disable(int); - -static void tp_perf_counter_destroy(struct perf_counter *counter) -{ - ftrace_profile_disable(counter->attr.config); -} - -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) -{ - /* - * Raw tracepoint data is a severe data leak, only allow root to - * have these. - */ - if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && - !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - if (ftrace_profile_enable(counter->attr.config)) - return NULL; - - counter->destroy = tp_perf_counter_destroy; - - return &perf_ops_generic; -} -#else -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) -{ - return NULL; -} -#endif - -atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; - -static void sw_perf_counter_destroy(struct perf_counter *counter) -{ - u64 event = counter->attr.config; - - WARN_ON(counter->parent); - - atomic_dec(&perf_swcounter_enabled[event]); -} - -static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) -{ - const struct pmu *pmu = NULL; - u64 event = counter->attr.config; - - /* - * Software counters (currently) can't in general distinguish - * between user, kernel and hypervisor events. - * However, context switches and cpu migrations are considered - * to be kernel events, and page faults are never hypervisor - * events. - */ - switch (event) { - case PERF_COUNT_SW_CPU_CLOCK: - pmu = &perf_ops_cpu_clock; - - break; - case PERF_COUNT_SW_TASK_CLOCK: - /* - * If the user instantiates this as a per-cpu counter, - * use the cpu_clock counter instead. - */ - if (counter->ctx->task) - pmu = &perf_ops_task_clock; - else - pmu = &perf_ops_cpu_clock; - - break; - case PERF_COUNT_SW_PAGE_FAULTS: - case PERF_COUNT_SW_PAGE_FAULTS_MIN: - case PERF_COUNT_SW_PAGE_FAULTS_MAJ: - case PERF_COUNT_SW_CONTEXT_SWITCHES: - case PERF_COUNT_SW_CPU_MIGRATIONS: - if (!counter->parent) { - atomic_inc(&perf_swcounter_enabled[event]); - counter->destroy = sw_perf_counter_destroy; - } - pmu = &perf_ops_generic; - break; - } - - return pmu; -} - -/* - * Allocate and initialize a counter structure - */ -static struct perf_counter * -perf_counter_alloc(struct perf_counter_attr *attr, - int cpu, - struct perf_counter_context *ctx, - struct perf_counter *group_leader, - struct perf_counter *parent_counter, - gfp_t gfpflags) -{ - const struct pmu *pmu; - struct perf_counter *counter; - struct hw_perf_counter *hwc; - long err; - - counter = kzalloc(sizeof(*counter), gfpflags); - if (!counter) - return ERR_PTR(-ENOMEM); - - /* - * Single counters are their own group leaders, with an - * empty sibling list: - */ - if (!group_leader) - group_leader = counter; - - mutex_init(&counter->child_mutex); - INIT_LIST_HEAD(&counter->child_list); - - INIT_LIST_HEAD(&counter->list_entry); - INIT_LIST_HEAD(&counter->event_entry); - INIT_LIST_HEAD(&counter->sibling_list); - init_waitqueue_head(&counter->waitq); - - mutex_init(&counter->mmap_mutex); - - counter->cpu = cpu; - counter->attr = *attr; - counter->group_leader = group_leader; - counter->pmu = NULL; - counter->ctx = ctx; - counter->oncpu = -1; - - counter->parent = parent_counter; - - counter->ns = get_pid_ns(current->nsproxy->pid_ns); - counter->id = atomic64_inc_return(&perf_counter_id); - - counter->state = PERF_COUNTER_STATE_INACTIVE; - - if (attr->disabled) - counter->state = PERF_COUNTER_STATE_OFF; - - pmu = NULL; - - hwc = &counter->hw; - hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) - hwc->sample_period = 1; - hwc->last_period = hwc->sample_period; - - atomic64_set(&hwc->period_left, hwc->sample_period); - - /* - * we currently do not support PERF_FORMAT_GROUP on inherited counters - */ - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) - goto done; - - switch (attr->type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - pmu = hw_perf_counter_init(counter); - break; - - case PERF_TYPE_SOFTWARE: - pmu = sw_perf_counter_init(counter); - break; - - case PERF_TYPE_TRACEPOINT: - pmu = tp_perf_counter_init(counter); - break; - - default: - break; - } -done: - err = 0; - if (!pmu) - err = -EINVAL; - else if (IS_ERR(pmu)) - err = PTR_ERR(pmu); - - if (err) { - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); - return ERR_PTR(err); - } - - counter->pmu = pmu; - - if (!counter->parent) { - atomic_inc(&nr_counters); - if (counter->attr.mmap) - atomic_inc(&nr_mmap_counters); - if (counter->attr.comm) - atomic_inc(&nr_comm_counters); - if (counter->attr.task) - atomic_inc(&nr_task_counters); - } - - return counter; -} - -static int perf_copy_attr(struct perf_counter_attr __user *uattr, - struct perf_counter_attr *attr) -{ - int ret; - u32 size; - - if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ - size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) - goto err_size; - - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0. - */ - if (size > sizeof(*attr)) { - unsigned long val; - unsigned long __user *addr; - unsigned long __user *end; - - addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr), - sizeof(unsigned long)); - end = PTR_ALIGN((void __user *)uattr + size, - sizeof(unsigned long)); - - for (; addr < end; addr += sizeof(unsigned long)) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - } - - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - - /* - * If the type exists, the corresponding creation will verify - * the attr->config. - */ - if (attr->type >= PERF_TYPE_MAX) - return -EINVAL; - - if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) - return -EINVAL; - - if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) - return -EINVAL; - - if (attr->read_format & ~(PERF_FORMAT_MAX-1)) - return -EINVAL; - -out: - return ret; - -err_size: - put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; -} - -/** - * sys_perf_counter_open - open a performance counter, associate it to a task/cpu - * - * @attr_uptr: event type attributes for monitoring/sampling - * @pid: target pid - * @cpu: target cpu - * @group_fd: group leader counter fd - */ -SYSCALL_DEFINE5(perf_counter_open, - struct perf_counter_attr __user *, attr_uptr, - pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) -{ - struct perf_counter *counter, *group_leader; - struct perf_counter_attr attr; - struct perf_counter_context *ctx; - struct file *counter_file = NULL; - struct file *group_file = NULL; - int fput_needed = 0; - int fput_needed2 = 0; - int ret; - - /* for future expandability... */ - if (flags) - return -EINVAL; - - ret = perf_copy_attr(attr_uptr, &attr); - if (ret) - return ret; - - if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - } - - if (attr.freq) { - if (attr.sample_freq > sysctl_perf_counter_sample_rate) - return -EINVAL; - } - - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - /* - * Look up the group leader (we will attach this counter to it): - */ - group_leader = NULL; - if (group_fd != -1) { - ret = -EINVAL; - group_file = fget_light(group_fd, &fput_needed); - if (!group_file) - goto err_put_context; - if (group_file->f_op != &perf_fops) - goto err_put_context; - - group_leader = group_file->private_data; - /* - * Do not allow a recursive hierarchy (this new sibling - * becoming part of another group-sibling): - */ - if (group_leader->group_leader != group_leader) - goto err_put_context; - /* - * Do not allow to attach to a group in a different - * task or CPU context: - */ - if (group_leader->ctx != ctx) - goto err_put_context; - /* - * Only a group leader can be exclusive or pinned - */ - if (attr.exclusive || attr.pinned) - goto err_put_context; - } - - counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, - NULL, GFP_KERNEL); - ret = PTR_ERR(counter); - if (IS_ERR(counter)) - goto err_put_context; - - ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); - if (ret < 0) - goto err_free_put_context; - - counter_file = fget_light(ret, &fput_needed2); - if (!counter_file) - goto err_free_put_context; - - counter->filp = counter_file; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_install_in_context(ctx, counter, cpu); - ++ctx->generation; - mutex_unlock(&ctx->mutex); - - counter->owner = current; - get_task_struct(current); - mutex_lock(¤t->perf_counter_mutex); - list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); - mutex_unlock(¤t->perf_counter_mutex); - - fput_light(counter_file, fput_needed2); - -out_fput: - fput_light(group_file, fput_needed); - - return ret; - -err_free_put_context: - kfree(counter); - -err_put_context: - put_ctx(ctx); - - goto out_fput; -} - -/* - * inherit a counter from parent task to child task: - */ -static struct perf_counter * -inherit_counter(struct perf_counter *parent_counter, - struct task_struct *parent, - struct perf_counter_context *parent_ctx, - struct task_struct *child, - struct perf_counter *group_leader, - struct perf_counter_context *child_ctx) -{ - struct perf_counter *child_counter; - - /* - * Instead of creating recursive hierarchies of counters, - * we link inherited counters back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - if (parent_counter->parent) - parent_counter = parent_counter->parent; - - child_counter = perf_counter_alloc(&parent_counter->attr, - parent_counter->cpu, child_ctx, - group_leader, parent_counter, - GFP_KERNEL); - if (IS_ERR(child_counter)) - return child_counter; - get_ctx(child_ctx); - - /* - * Make the child state follow the state of the parent counter, - * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en, dis}able_family. - */ - if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - else - child_counter->state = PERF_COUNTER_STATE_OFF; - - if (parent_counter->attr.freq) - child_counter->hw.sample_period = parent_counter->hw.sample_period; - - /* - * Link it up in the child's context: - */ - add_counter_to_ctx(child_counter, child_ctx); - - /* - * Get a reference to the parent filp - we will fput it - * when the child counter exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_counter->filp->f_count); - - /* - * Link this into the parent counter's child list - */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_add_tail(&child_counter->child_list, &parent_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); - - return child_counter; -} - -static int inherit_group(struct perf_counter *parent_counter, - struct task_struct *parent, - struct perf_counter_context *parent_ctx, - struct task_struct *child, - struct perf_counter_context *child_ctx) -{ - struct perf_counter *leader; - struct perf_counter *sub; - struct perf_counter *child_ctr; - - leader = inherit_counter(parent_counter, parent, parent_ctx, - child, NULL, child_ctx); - if (IS_ERR(leader)) - return PTR_ERR(leader); - list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { - child_ctr = inherit_counter(sub, parent, parent_ctx, - child, leader, child_ctx); - if (IS_ERR(child_ctr)) - return PTR_ERR(child_ctr); - } - return 0; -} - -static void sync_child_counter(struct perf_counter *child_counter, - struct task_struct *child) -{ - struct perf_counter *parent_counter = child_counter->parent; - u64 child_val; - - if (child_counter->attr.inherit_stat) - perf_counter_read_event(child_counter, child); - - child_val = atomic64_read(&child_counter->count); - - /* - * Add back the child's count to the parent's count: - */ - atomic64_add(child_val, &parent_counter->count); - atomic64_add(child_counter->total_time_enabled, - &parent_counter->child_total_time_enabled); - atomic64_add(child_counter->total_time_running, - &parent_counter->child_total_time_running); - - /* - * Remove this counter from the parent's list - */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_del_init(&child_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); - - /* - * Release the parent counter, if this was the last - * reference to it. - */ - fput(parent_counter->filp); -} - -static void -__perf_counter_exit_task(struct perf_counter *child_counter, - struct perf_counter_context *child_ctx, - struct task_struct *child) -{ - struct perf_counter *parent_counter; - - update_counter_times(child_counter); - perf_counter_remove_from_context(child_counter); - - parent_counter = child_counter->parent; - /* - * It can happen that parent exits first, and has counters - * that are still around due to the child reference. These - * counters need to be zapped - but otherwise linger. - */ - if (parent_counter) { - sync_child_counter(child_counter, child); - free_counter(child_counter); - } -} - -/* - * When a child task exits, feed back counter values to parent counters. - */ -void perf_counter_exit_task(struct task_struct *child) -{ - struct perf_counter *child_counter, *tmp; - struct perf_counter_context *child_ctx; - unsigned long flags; - - if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, NULL, 0); - return; - } - - local_irq_save(flags); - /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. - */ - child_ctx = child->perf_counter_ctxp; - __perf_counter_task_sched_out(child_ctx); - - /* - * Take the context lock here so that if find_get_context is - * reading child->perf_counter_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. - */ - spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; - /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the counters from it. - */ - unclone_ctx(child_ctx); - spin_unlock_irqrestore(&child_ctx->lock, flags); - - /* - * Report the task dead after unscheduling the counters so that we - * won't get any samples after PERF_EVENT_EXIT. We can however still - * get a few PERF_EVENT_READ events. - */ - perf_counter_task(child, child_ctx, 0); - - /* - * We can recurse on the same lock type through: - * - * __perf_counter_exit_task() - * sync_child_counter() - * fput(parent_counter->filp) - * perf_release() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. - */ - mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); - -again: - list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, - list_entry) - __perf_counter_exit_task(child_counter, child_ctx, child); - - /* - * If the last counter was a group counter, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->counter_list)) - goto again; - - mutex_unlock(&child_ctx->mutex); - - put_ctx(child_ctx); -} - -/* - * free an unexposed, unused context as created by inheritance by - * init_task below, used by fork() in case of fail. - */ -void perf_counter_free_task(struct task_struct *task) -{ - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter *counter, *tmp; - - if (!ctx) - return; - - mutex_lock(&ctx->mutex); -again: - list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) { - struct perf_counter *parent = counter->parent; - - if (WARN_ON_ONCE(!parent)) - continue; - - mutex_lock(&parent->child_mutex); - list_del_init(&counter->child_list); - mutex_unlock(&parent->child_mutex); - - fput(parent->filp); - - list_del_counter(counter, ctx); - free_counter(counter); - } - - if (!list_empty(&ctx->counter_list)) - goto again; - - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - -/* - * Initialize the perf_counter context in task_struct - */ -int perf_counter_init_task(struct task_struct *child) -{ - struct perf_counter_context *child_ctx, *parent_ctx; - struct perf_counter_context *cloned_ctx; - struct perf_counter *counter; - struct task_struct *parent = current; - int inherited_all = 1; - int ret = 0; - - child->perf_counter_ctxp = NULL; - - mutex_init(&child->perf_counter_mutex); - INIT_LIST_HEAD(&child->perf_counter_list); - - if (likely(!parent->perf_counter_ctxp)) - return 0; - - /* - * This is executed from the parent task context, so inherit - * counters that have been marked for cloning. - * First allocate and initialize a context for the child. - */ - - child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - if (!child_ctx) - return -ENOMEM; - - __perf_counter_init_context(child_ctx, child); - child->perf_counter_ctxp = child_ctx; - get_task_struct(child); - - /* - * If the parent's context is a clone, pin it so it won't get - * swapped under us. - */ - parent_ctx = perf_pin_task_context(parent); - - /* - * No need to check if parent_ctx != NULL here; since we saw - * it non-NULL earlier, the only reason for it to become NULL - * is if we exit, and since we're currently in the middle of - * a fork we can't be exiting at the same time. - */ - - /* - * Lock the parent list. No need to lock the child - not PID - * hashed yet and not running, so nobody can access it. - */ - mutex_lock(&parent_ctx->mutex); - - /* - * We dont have to disable NMIs - we are only looking at - * the list, not manipulating it: - */ - list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { - if (counter != counter->group_leader) - continue; - - if (!counter->attr.inherit) { - inherited_all = 0; - continue; - } - - ret = inherit_group(counter, parent, parent_ctx, - child, child_ctx); - if (ret) { - inherited_all = 0; - break; - } - } - - if (inherited_all) { - /* - * Mark the child context as a clone of the parent - * context, or of whatever the parent is a clone of. - * Note that if the parent is a clone, it could get - * uncloned at any point, but that doesn't matter - * because the list of counters and the generation - * count can't have changed since we took the mutex. - */ - cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); - if (cloned_ctx) { - child_ctx->parent_ctx = cloned_ctx; - child_ctx->parent_gen = parent_ctx->parent_gen; - } else { - child_ctx->parent_ctx = parent_ctx; - child_ctx->parent_gen = parent_ctx->generation; - } - get_ctx(child_ctx->parent_ctx); - } - - mutex_unlock(&parent_ctx->mutex); - - perf_unpin_context(parent_ctx); - - return ret; -} - -static void __cpuinit perf_counter_init_cpu(int cpu) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - __perf_counter_init_context(&cpuctx->ctx, NULL); - - spin_lock(&perf_resource_lock); - cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; - spin_unlock(&perf_resource_lock); - - hw_perf_counter_setup(cpu); -} - -#ifdef CONFIG_HOTPLUG_CPU -static void __perf_counter_exit_cpu(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = &cpuctx->ctx; - struct perf_counter *counter, *tmp; - - list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) - __perf_counter_remove_from_context(counter); -} -static void perf_counter_exit_cpu(int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &cpuctx->ctx; - - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); - mutex_unlock(&ctx->mutex); -} -#else -static inline void perf_counter_exit_cpu(int cpu) { } -#endif - -static int __cpuinit -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - perf_counter_init_cpu(cpu); - break; - - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - hw_perf_counter_setup_online(cpu); - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - perf_counter_exit_cpu(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -/* - * This has to have a higher priority than migration_notifier in sched.c. - */ -static struct notifier_block __cpuinitdata perf_cpu_nb = { - .notifier_call = perf_cpu_notify, - .priority = 20, -}; - -void __init perf_counter_init(void) -{ - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&perf_cpu_nb); -} - -static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) -{ - return sprintf(buf, "%d\n", perf_reserved_percpu); -} - -static ssize_t -perf_set_reserve_percpu(struct sysdev_class *class, - const char *buf, - size_t count) -{ - struct perf_cpu_context *cpuctx; - unsigned long val; - int err, cpu, mpt; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > perf_max_counters) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_reserved_percpu = val; - for_each_online_cpu(cpu) { - cpuctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_irq(&cpuctx->ctx.lock); - mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, - perf_max_counters - perf_reserved_percpu); - cpuctx->max_pertask = mpt; - spin_unlock_irq(&cpuctx->ctx.lock); - } - spin_unlock(&perf_resource_lock); - - return count; -} - -static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) -{ - return sprintf(buf, "%d\n", perf_overcommit); -} - -static ssize_t -perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) -{ - unsigned long val; - int err; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > 1) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_overcommit = val; - spin_unlock(&perf_resource_lock); - - return count; -} - -static SYSDEV_CLASS_ATTR( - reserve_percpu, - 0644, - perf_show_reserve_percpu, - perf_set_reserve_percpu - ); - -static SYSDEV_CLASS_ATTR( - overcommit, - 0644, - perf_show_overcommit, - perf_set_overcommit - ); - -static struct attribute *perfclass_attrs[] = { - &attr_reserve_percpu.attr, - &attr_overcommit.attr, - NULL -}; - -static struct attribute_group perfclass_attr_group = { - .attrs = perfclass_attrs, - .name = "perf_counters", -}; - -static int __init perf_counter_sysfs_init(void) -{ - return sysfs_create_group(&cpu_sysdev_class.kset.kobj, - &perfclass_attr_group); -} -device_initcall(perf_counter_sysfs_init); diff --git a/kernel/perf_event.c b/kernel/perf_event.c new file mode 100644 index 0000000..76ac4db --- /dev/null +++ b/kernel/perf_event.c @@ -0,0 +1,5000 @@ +/* + * Performance events core code: + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * For licensing details see kernel-base/COPYING + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/file.h> +#include <linux/poll.h> +#include <linux/sysfs.h> +#include <linux/dcache.h> +#include <linux/percpu.h> +#include <linux/ptrace.h> +#include <linux/vmstat.h> +#include <linux/hardirq.h> +#include <linux/rculist.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> +#include <linux/anon_inodes.h> +#include <linux/kernel_stat.h> +#include <linux/perf_event.h> + +#include <asm/irq_regs.h> + +/* + * Each CPU has a list of per CPU events: + */ +DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +int perf_max_events __read_mostly = 1; +static int perf_reserved_percpu __read_mostly; +static int perf_overcommit __read_mostly = 1; + +static atomic_t nr_events __read_mostly; +static atomic_t nr_mmap_events __read_mostly; +static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_task_events __read_mostly; + +/* + * perf event paranoia level: + * -1 - not paranoid at all + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv + */ +int sysctl_perf_event_paranoid __read_mostly = 1; + +static inline bool perf_paranoid_tracepoint_raw(void) +{ + return sysctl_perf_event_paranoid > -1; +} + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_event_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_event_paranoid > 1; +} + +int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ + +/* + * max perf event sample rate + */ +int sysctl_perf_event_sample_rate __read_mostly = 100000; + +static atomic64_t perf_event_id; + +/* + * Lock for (sysadmin-configurable) event reservations: + */ +static DEFINE_SPINLOCK(perf_resource_lock); + +/* + * Architecture provided APIs - weak aliases: + */ +extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + return NULL; +} + +void __weak hw_perf_disable(void) { barrier(); } +void __weak hw_perf_enable(void) { barrier(); } + +void __weak hw_perf_event_setup(int cpu) { barrier(); } +void __weak hw_perf_event_setup_online(int cpu) { barrier(); } + +int __weak +hw_perf_group_sched_in(struct perf_event *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu) +{ + return 0; +} + +void __weak perf_event_print_debug(void) { } + +static DEFINE_PER_CPU(int, perf_disable_count); + +void __perf_disable(void) +{ + __get_cpu_var(perf_disable_count)++; +} + +bool __perf_enable(void) +{ + return !--__get_cpu_var(perf_disable_count); +} + +void perf_disable(void) +{ + __perf_disable(); + hw_perf_disable(); +} + +void perf_enable(void) +{ + if (__perf_enable()) + hw_perf_enable(); +} + +static void get_ctx(struct perf_event_context *ctx) +{ + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); +} + +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx); +} + +static void put_ctx(struct perf_event_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); + } +} + +static void unclone_ctx(struct perf_event_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + +/* + * If we inherit events we want to return the parent event id + * to userspace. + */ +static u64 primary_event_id(struct perf_event *event) +{ + u64 id = event->id; + + if (event->parent) + id = event->parent->id; + + return id; +} + +/* + * Get the perf_event_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_event_context * +perf_lock_task_context(struct task_struct *task, unsigned long *flags) +{ + struct perf_event_context *ctx; + + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_event_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_event_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_event_ctxp)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + + if (!atomic_inc_not_zero(&ctx->refcount)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + ctx = NULL; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_event_context *perf_pin_task_context(struct task_struct *task) +{ + struct perf_event_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + ++ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_event_context *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + put_ctx(ctx); +} + +/* + * Add a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_add_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *group_leader = event->group_leader; + + /* + * Depending on whether it is a standalone or sibling event, + * add it straight to the context's event list, or to the group + * leader's sibling list: + */ + if (group_leader == event) + list_add_tail(&event->group_entry, &ctx->group_list); + else { + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + } + + list_add_rcu(&event->event_entry, &ctx->event_list); + ctx->nr_events++; + if (event->attr.inherit_stat) + ctx->nr_stat++; +} + +/* + * Remove a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_del_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *sibling, *tmp; + + if (list_empty(&event->group_entry)) + return; + ctx->nr_events--; + if (event->attr.inherit_stat) + ctx->nr_stat--; + + list_del_init(&event->group_entry); + list_del_rcu(&event->event_entry); + + if (event->group_leader != event) + event->group_leader->nr_siblings--; + + /* + * If this was a group event with sibling events then + * upgrade the siblings to singleton events by adding them + * to the context list directly: + */ + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + + list_move_tail(&sibling->group_entry, &ctx->group_list); + sibling->group_leader = sibling; + } +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { + event->pending_disable = 0; + event->state = PERF_EVENT_STATE_OFF; + } + event->tstamp_stopped = ctx->time; + event->pmu->disable(event); + event->oncpu = -1; + + if (!is_software_event(event)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +static void +group_sched_out(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + + if (group_event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event_sched_out(group_event, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) + event_sched_out(event, cpuctx, ctx); + + if (group_event->attr.exclusive) + cpuctx->exclusive = 0; +} + +/* + * Cross CPU call to remove a performance event + * + * We disable the event on the hardware level first. After that we + * remove it from the context list. + */ +static void __perf_event_remove_from_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + /* + * Protect the list operation against NMI by disabling the + * events on a global level. + */ + perf_disable(); + + event_sched_out(event, cpuctx, ctx); + + list_del_event(event, ctx); + + if (!ctx->task) { + /* + * Allow more per task events with respect to the + * reservation: + */ + cpuctx->max_pertask = + min(perf_max_events - ctx->nr_events, + perf_max_events - perf_reserved_percpu); + } + + perf_enable(); + spin_unlock(&ctx->lock); +} + + +/* + * Remove the event from a task's (or a CPU's) list of events. + * + * Must be called with ctx->mutex held. + * + * CPU events are removed with a smp call. For task events we only + * call when the task is on a CPU. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_event_exit_task, it's OK because the + * context has been detached from its task. + */ +static void perf_event_remove_from_context(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu events are removed via an smp call and + * the removal is always sucessful. + */ + smp_call_function_single(event->cpu, + __perf_event_remove_from_context, + event, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_event_remove_from_context, + event); + + spin_lock_irq(&ctx->lock); + /* + * If the context is active we need to retry the smp call. + */ + if (ctx->nr_active && !list_empty(&event->group_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can remove the event safely, if the call above did not + * succeed. + */ + if (!list_empty(&event->group_entry)) { + list_del_event(event, ctx); + } + spin_unlock_irq(&ctx->lock); +} + +static inline u64 perf_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + + event->total_time_enabled = ctx->time - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = ctx->time; + + event->total_time_running = run_end - event->tstamp_running; +} + +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + +/* + * Cross CPU call to disable a performance event + */ +static void __perf_event_disable(void *info) +{ + struct perf_event *event = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = event->ctx; + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + + /* + * If the event is on, turn it off. + * If it is in error state, leave it in error state. + */ + if (event->state >= PERF_EVENT_STATE_INACTIVE) { + update_context_time(ctx); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; + } + + spin_unlock(&ctx->lock); +} + +/* + * Disable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisifed when called through + * perf_event_for_each_child or perf_event_for_each because they + * hold the top-level event's child_mutex, so any descendant that + * goes to exit will block in sync_child_event. + * When called from perf_pending_event it's OK because event->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_event_task_sched_out for this context. + */ +static void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the event on the cpu that it's on + */ + smp_call_function_single(event->cpu, __perf_event_disable, + event, 1); + return; + } + + retry: + task_oncpu_function_call(task, __perf_event_disable, event); + + spin_lock_irq(&ctx->lock); + /* + * If the event is still active, we need to retry the cross-call. + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_OFF; + } + + spin_unlock_irq(&ctx->lock); +} + +static int +event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + int cpu) +{ + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + /* + * The new state must be visible before we turn it on in the hardware: + */ + smp_wmb(); + + if (event->pmu->enable(event)) { + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; + return -EAGAIN; + } + + event->tstamp_running += ctx->time - event->tstamp_stopped; + + if (!is_software_event(event)) + cpuctx->active_oncpu++; + ctx->nr_active++; + + if (event->attr.exclusive) + cpuctx->exclusive = 1; + + return 0; +} + +static int +group_sched_in(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + int cpu) +{ + struct perf_event *event, *partial_group; + int ret; + + if (group_event->state == PERF_EVENT_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? ret : 0; + + if (event_sched_in(group_event, cpuctx, ctx, cpu)) + return -EAGAIN; + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event_sched_in(event, cpuctx, ctx, cpu)) { + partial_group = event; + goto group_error; + } + } + + return 0; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event == partial_group) + break; + event_sched_out(event, cpuctx, ctx); + } + event_sched_out(group_event, cpuctx, ctx); + + return -EAGAIN; +} + +/* + * Return 1 for a group consisting entirely of software events, + * 0 if the group contains any hardware events. + */ +static int is_software_only_group(struct perf_event *leader) +{ + struct perf_event *event; + + if (!is_software_event(leader)) + return 0; + + list_for_each_entry(event, &leader->sibling_list, group_entry) + if (!is_software_event(event)) + return 0; + + return 1; +} + +/* + * Work out whether we can put this event group on the CPU now. + */ +static int group_can_go_on(struct perf_event *event, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software events can always go on. + */ + if (is_software_only_group(event)) + return 1; + /* + * If an exclusive group is already on, no other hardware + * events can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * events on the CPU, it can't go on. + */ + if (event->attr.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. + */ + return can_add_hw; +} + +static void add_event_to_ctx(struct perf_event *event, + struct perf_event_context *ctx) +{ + list_add_event(event, ctx); + event->tstamp_enabled = ctx->time; + event->tstamp_running = ctx->time; + event->tstamp_stopped = ctx->time; +} + +/* + * Cross CPU call to install and enable a performance event + * + * Must be called with ctx->mutex held + */ +static void __perf_install_in_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + int cpu = smp_processor_id(); + int err; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no events. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + /* + * Protect the list operation against NMI by disabling the + * events on a global level. NOP for non NMI based events. + */ + perf_disable(); + + add_event_to_ctx(event, ctx); + + /* + * Don't put the event on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE || + (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) + goto unlock; + + /* + * An exclusive event can't go on if there are already active + * hardware events, and no hardware event can go on if there + * is already an exclusive event on. + */ + if (!group_can_go_on(event, cpuctx, 1)) + err = -EEXIST; + else + err = event_sched_in(event, cpuctx, ctx, cpu); + + if (err) { + /* + * This event couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the event group is pinned then put it in error state. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + + if (!err && !ctx->task && cpuctx->max_pertask) + cpuctx->max_pertask--; + + unlock: + perf_enable(); + + spin_unlock(&ctx->lock); +} + +/* + * Attach a performance event to a context + * + * First we add the event to the list with the hardware enable bit + * in event->hw_config cleared. + * + * If the event is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + * + * Must be called with ctx->mutex held. + */ +static void +perf_install_in_context(struct perf_event_context *ctx, + struct perf_event *event, + int cpu) +{ + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu events are installed via an smp call and + * the install is always sucessful. + */ + smp_call_function_single(cpu, __perf_install_in_context, + event, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_install_in_context, + event); + + spin_lock_irq(&ctx->lock); + /* + * we need to retry the smp call. + */ + if (ctx->is_active && list_empty(&event->group_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can add the event safely, if it the call above did not + * succeed. + */ + if (list_empty(&event->group_entry)) + add_event_to_ctx(event, ctx); + spin_unlock_irq(&ctx->lock); +} + +/* + * Put a event into inactive state and update time fields. + * Enabling the leader of a group effectively enables all + * the group members that aren't explicitly disabled, so we + * have to update their ->tstamp_enabled also. + * Note: this works for group members as well as group leaders + * since the non-leader members' sibling_lists will be empty. + */ +static void __perf_event_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + + event->state = PERF_EVENT_STATE_INACTIVE; + event->tstamp_enabled = ctx->time - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) + if (sub->state >= PERF_EVENT_STATE_INACTIVE) + sub->tstamp_enabled = + ctx->time - sub->total_time_enabled; +} + +/* + * Cross CPU call to enable a performance event + */ +static void __perf_event_enable(void *info) +{ + struct perf_event *event = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + int err; + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto unlock; + __perf_event_mark_enabled(event, ctx); + + /* + * If the event is in a group and isn't the group leader, + * then don't put it on unless the group is on. + */ + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) + goto unlock; + + if (!group_can_go_on(event, cpuctx, 1)) { + err = -EEXIST; + } else { + perf_disable(); + if (event == leader) + err = group_sched_in(event, cpuctx, ctx, + smp_processor_id()); + else + err = event_sched_in(event, cpuctx, ctx, + smp_processor_id()); + perf_enable(); + } + + if (err) { + /* + * If this event can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + + unlock: + spin_unlock(&ctx->lock); +} + +/* + * Enable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_event_for_each_child or perf_event_for_each as described + * for perf_event_disable. + */ +static void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the event on the cpu that it's on + */ + smp_call_function_single(event->cpu, __perf_event_enable, + event, 1); + return; + } + + spin_lock_irq(&ctx->lock); + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto out; + + /* + * If the event is in error state, clear that first. + * That way, if we see the event in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. + */ + if (event->state == PERF_EVENT_STATE_ERROR) + event->state = PERF_EVENT_STATE_OFF; + + retry: + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_event_enable, event); + + spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the event is still off, + * we need to retry the cross-call. + */ + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) + goto retry; + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_OFF) + __perf_event_mark_enabled(event, ctx); + + out: + spin_unlock_irq(&ctx->lock); +} + +static int perf_event_refresh(struct perf_event *event, int refresh) +{ + /* + * not supported on inherited events + */ + if (event->attr.inherit) + return -EINVAL; + + atomic_add(refresh, &event->event_limit); + perf_event_enable(event); + + return 0; +} + +void __perf_event_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_event *event; + + spin_lock(&ctx->lock); + ctx->is_active = 0; + if (likely(!ctx->nr_events)) + goto out; + update_context_time(ctx); + + perf_disable(); + if (ctx->nr_active) { + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event != event->group_leader) + event_sched_out(event, cpuctx, ctx); + else + group_sched_out(event, cpuctx, ctx); + } + } + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled events. + * If the number of enabled events is the same, then the set + * of enabled events should be the same, because these are both + * inherited contexts, therefore we can't access individual events + * in them directly with an fd; we can only enable/disable all + * events via prctl, or enable/disable all events in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_event_context *ctx1, + struct perf_event_context *ctx2) +{ + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && !ctx1->pin_count && !ctx2->pin_count; +} + +static void __perf_event_read(void *event); + +static void __perf_event_sync_stat(struct perf_event *event, + struct perf_event *next_event) +{ + u64 value; + + if (!event->attr.inherit_stat) + return; + + /* + * Update the event value, we cannot use perf_event_read() + * because we're in the middle of a context switch and have IRQs + * disabled, which upsets smp_call_function_single(), however + * we know the event must be on the current CPU, therefore we + * don't need to use it. + */ + switch (event->state) { + case PERF_EVENT_STATE_ACTIVE: + __perf_event_read(event); + break; + + case PERF_EVENT_STATE_INACTIVE: + update_event_times(event); + break; + + default: + break; + } + + /* + * In order to keep per-task stats reliable we need to flip the event + * values when we flip the contexts. + */ + value = atomic64_read(&next_event->count); + value = atomic64_xchg(&event->count, value); + atomic64_set(&next_event->count, value); + + swap(event->total_time_enabled, next_event->total_time_enabled); + swap(event->total_time_running, next_event->total_time_running); + + /* + * Since we swizzled the values, update the user visible data too. + */ + perf_event_update_userpage(event); + perf_event_update_userpage(next_event); +} + +#define list_next_entry(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + +static void perf_event_sync_stat(struct perf_event_context *ctx, + struct perf_event_context *next_ctx) +{ + struct perf_event *event, *next_event; + + if (!ctx->nr_stat) + return; + + event = list_first_entry(&ctx->event_list, + struct perf_event, event_entry); + + next_event = list_first_entry(&next_ctx->event_list, + struct perf_event, event_entry); + + while (&event->event_entry != &ctx->event_list && + &next_event->event_entry != &next_ctx->event_list) { + + __perf_event_sync_stat(event, next_event); + + event = list_next_entry(event, event_entry); + next_event = list_next_entry(next_event, event_entry); + } +} + +/* + * Called from scheduler to remove the events of the current task, + * with interrupts disabled. + * + * We stop each event and update the event value in event->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. + */ +void perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *next_ctx; + struct perf_event_context *parent; + struct pt_regs *regs; + int do_switch = 1; + + regs = task_pt_regs(task); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + + if (likely(!ctx || !cpuctx->task_ctx)) + return; + + update_context_time(ctx); + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); + next_ctx = next->perf_event_ctxp; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. + */ + spin_lock(&ctx->lock); + spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + /* + * XXX do we need a memory barrier of sorts + * wrt to rcu_dereference() of perf_event_ctxp + */ + task->perf_event_ctxp = next_ctx; + next->perf_event_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + + perf_event_sync_stat(ctx, next_ctx); + } + spin_unlock(&next_ctx->lock); + spin_unlock(&ctx->lock); + } + rcu_read_unlock(); + + if (do_switch) { + __perf_event_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; + } +} + +/* + * Called with IRQs disabled + */ +static void __perf_event_task_sched_out(struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + __perf_event_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; +} + +/* + * Called with IRQs disabled + */ +static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) +{ + __perf_event_sched_out(&cpuctx->ctx, cpuctx); +} + +static void +__perf_event_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_event *event; + int can_add_hw = 1; + + spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_events)) + goto out; + + ctx->timestamp = perf_clock(); + + perf_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF || + !event->attr.pinned) + continue; + if (event->cpu != -1 && event->cpu != cpu) + continue; + + if (event != event->group_leader) + event_sched_in(event, cpuctx, ctx, cpu); + else { + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx, cpu); + } + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_ERROR; + } + } + + list_for_each_entry(event, &ctx->group_list, group_entry) { + /* + * Ignore events in OFF or ERROR state, and + * ignore pinned events since we did them already. + */ + if (event->state <= PERF_EVENT_STATE_OFF || + event->attr.pinned) + continue; + + /* + * Listen to the 'cpu' scheduling filter constraint + * of events: + */ + if (event->cpu != -1 && event->cpu != cpu) + continue; + + if (event != event->group_leader) { + if (event_sched_in(event, cpuctx, ctx, cpu)) + can_add_hw = 0; + } else { + if (group_can_go_on(event, cpuctx, can_add_hw)) { + if (group_sched_in(event, cpuctx, ctx, cpu)) + can_add_hw = 0; + } + } + } + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Called from scheduler to add the events of the current task + * with interrupts disabled. + * + * We restore the event value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. + */ +void perf_event_task_sched_in(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = task->perf_event_ctxp; + + if (likely(!ctx)) + return; + if (cpuctx->task_ctx == ctx) + return; + __perf_event_sched_in(ctx, cpuctx, cpu); + cpuctx->task_ctx = ctx; +} + +static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_event_context *ctx = &cpuctx->ctx; + + __perf_event_sched_in(ctx, cpuctx, cpu); +} + +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + +static void perf_adjust_period(struct perf_event *event, u64 events) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period, sample_period; + s64 delta; + + events *= hwc->sample_period; + period = div64_u64(events, event->attr.sample_freq); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_event_context *ctx) +{ + struct perf_event *event; + struct hw_perf_event *hwc; + u64 interrupts, freq; + + spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + continue; + + hwc = &event->hw; + + interrupts = hwc->interrupts; + hwc->interrupts = 0; + + /* + * unthrottle events on the tick + */ + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(event, 1); + event->pmu->unthrottle(event); + interrupts = 2*sysctl_perf_event_sample_rate/HZ; + } + + if (!event->attr.freq || !event->attr.sample_freq) + continue; + + /* + * if the specified freq < HZ then we need to skip ticks + */ + if (event->attr.sample_freq < HZ) { + freq = event->attr.sample_freq; + + hwc->freq_count += freq; + hwc->freq_interrupts += interrupts; + + if (hwc->freq_count < HZ) + continue; + + interrupts = hwc->freq_interrupts; + hwc->freq_interrupts = 0; + hwc->freq_count -= HZ; + } else + freq = HZ; + + perf_adjust_period(event, freq * interrupts); + + /* + * In order to avoid being stalled by an (accidental) huge + * sample period, force reset the sample period if we didn't + * get any events in this freq period. + */ + if (!interrupts) { + perf_disable(); + event->pmu->disable(event); + atomic64_set(&hwc->period_left, 0); + event->pmu->enable(event); + perf_enable(); + } + } + spin_unlock(&ctx->lock); +} + +/* + * Round-robin a context's events: + */ +static void rotate_ctx(struct perf_event_context *ctx) +{ + struct perf_event *event; + + if (!ctx->nr_events) + return; + + spin_lock(&ctx->lock); + /* + * Rotate the first entry last (works just fine for group events too): + */ + perf_disable(); + list_for_each_entry(event, &ctx->group_list, group_entry) { + list_move_tail(&event->group_entry, &ctx->group_list); + break; + } + perf_enable(); + + spin_unlock(&ctx->lock); +} + +void perf_event_task_tick(struct task_struct *curr, int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + + if (!atomic_read(&nr_events)) + return; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = curr->perf_event_ctxp; + + perf_ctx_adjust_freq(&cpuctx->ctx); + if (ctx) + perf_ctx_adjust_freq(ctx); + + perf_event_cpu_sched_out(cpuctx); + if (ctx) + __perf_event_task_sched_out(ctx); + + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); + + perf_event_cpu_sched_in(cpuctx, cpu); + if (ctx) + perf_event_task_sched_in(curr, cpu); +} + +/* + * Enable all of a task's events that have been marked enable-on-exec. + * This expects task == current. + */ +static void perf_event_enable_on_exec(struct task_struct *task) +{ + struct perf_event_context *ctx; + struct perf_event *event; + unsigned long flags; + int enabled = 0; + + local_irq_save(flags); + ctx = task->perf_event_ctxp; + if (!ctx || !ctx->nr_events) + goto out; + + __perf_event_task_sched_out(ctx); + + spin_lock(&ctx->lock); + + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (!event->attr.enable_on_exec) + continue; + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + continue; + __perf_event_mark_enabled(event, ctx); + enabled = 1; + } + + /* + * Unclone this context if we enabled any event. + */ + if (enabled) + unclone_ctx(ctx); + + spin_unlock(&ctx->lock); + + perf_event_task_sched_in(task, smp_processor_id()); + out: + local_irq_restore(flags); +} + +/* + * Cross CPU call to read the hardware event + */ +static void __perf_event_read(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + unsigned long flags; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. In that case + * event->count would have been updated to a recent sample + * when the event was scheduled out. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + local_irq_save(flags); + if (ctx->is_active) + update_context_time(ctx); + event->pmu->read(event); + update_event_times(event); + local_irq_restore(flags); +} + +static u64 perf_event_read(struct perf_event *event) +{ + /* + * If event is enabled and currently active on a CPU, update the + * value in the event structure: + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + smp_call_function_single(event->oncpu, + __perf_event_read, event, 1); + } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_event_times(event); + } + + return atomic64_read(&event->count); +} + +/* + * Initialize the perf_event context in a task_struct: + */ +static void +__perf_event_init_context(struct perf_event_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->group_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + +static struct perf_event_context *find_get_context(pid_t pid, int cpu) +{ + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + struct task_struct *task; + unsigned long flags; + int err; + + /* + * If cpu is not a wildcard then this is a percpu event: + */ + if (cpu != -1) { + /* Must be root to operate on a CPU event: */ + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + if (cpu < 0 || cpu > num_possible_cpus()) + return ERR_PTR(-EINVAL); + + /* + * We could be clever and allow to attach a event to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_isset(cpu, cpu_online_map)) + return ERR_PTR(-ENODEV); + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + get_ctx(ctx); + + return ctx; + } + + rcu_read_lock(); + if (!pid) + task = current; + else + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + if (!task) + return ERR_PTR(-ESRCH); + + /* + * Can't attach events to a dying task. + */ + err = -ESRCH; + if (task->flags & PF_EXITING) + goto errout; + + /* Reuse ptrace permission checks for now. */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + retry: + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + unclone_ctx(ctx); + spin_unlock_irqrestore(&ctx->lock, flags); + } + + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + err = -ENOMEM; + if (!ctx) + goto errout; + __perf_event_init_context(ctx, task); + get_ctx(ctx); + if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { + /* + * We raced with some other task; use + * the context they set. + */ + kfree(ctx); + goto retry; + } + get_task_struct(task); + } + + put_task_struct(task); + return ctx; + + errout: + put_task_struct(task); + return ERR_PTR(err); +} + +static void free_event_rcu(struct rcu_head *head) +{ + struct perf_event *event; + + event = container_of(head, struct perf_event, rcu_head); + if (event->ns) + put_pid_ns(event->ns); + kfree(event); +} + +static void perf_pending_sync(struct perf_event *event); + +static void free_event(struct perf_event *event) +{ + perf_pending_sync(event); + + if (!event->parent) { + atomic_dec(&nr_events); + if (event->attr.mmap) + atomic_dec(&nr_mmap_events); + if (event->attr.comm) + atomic_dec(&nr_comm_events); + if (event->attr.task) + atomic_dec(&nr_task_events); + } + + if (event->output) { + fput(event->output->filp); + event->output = NULL; + } + + if (event->destroy) + event->destroy(event); + + put_ctx(event->ctx); + call_rcu(&event->rcu_head, free_event_rcu); +} + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_event *event = file->private_data; + struct perf_event_context *ctx = event->ctx; + + file->private_data = NULL; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_event_remove_from_context(event); + mutex_unlock(&ctx->mutex); + + mutex_lock(&event->owner->perf_event_mutex); + list_del_init(&event->owner_entry); + mutex_unlock(&event->owner->perf_event_mutex); + put_task_struct(event->owner); + + free_event(event); + + return 0; +} + +static int perf_event_read_size(struct perf_event *event) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_event_read_value(struct perf_event *event) +{ + struct perf_event *child; + u64 total = 0; + + total += perf_event_read(event); + list_for_each_entry(child, &event->child_list, child_list) + total += perf_event_read(child); + + return total; +} + +static int perf_event_read_entry(struct perf_event *event, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_event_read_value(event); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static int perf_event_read_group(struct perf_event *event, + u64 read_format, char __user *buf) +{ + struct perf_event *leader = event->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_event_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + err = perf_event_read_entry(sub, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_event_read_one(struct perf_event *event, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_event_read_value(event); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + +/* + * Read the performance event - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_event *event, char __user *buf, size_t count) +{ + u64 read_format = event->attr.read_format; + int ret; + + /* + * Return end-of-file for a read on a event that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (event->state == PERF_EVENT_STATE_ERROR) + return 0; + + if (count < perf_event_read_size(event)) + return -ENOSPC; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_event_read_group(event, read_format, buf); + else + ret = perf_event_read_one(event, read_format, buf); + mutex_unlock(&event->child_mutex); + + return ret; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct perf_event *event = file->private_data; + + return perf_read_hw(event, buf, count); +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ + struct perf_event *event = file->private_data; + struct perf_mmap_data *data; + unsigned int events = POLL_HUP; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (data) + events = atomic_xchg(&data->poll, 0); + rcu_read_unlock(); + + poll_wait(file, &event->waitq, wait); + + return events; +} + +static void perf_event_reset(struct perf_event *event) +{ + (void)perf_event_read(event); + atomic64_set(&event->count, 0); + perf_event_update_userpage(event); +} + +/* + * Holding the top-level event's child_mutex means that any + * descendant process that has inherited this event will block + * in sync_child_event if it goes to exit, thus satisfying the + * task existence requirements of perf_event_enable/disable. + */ +static void perf_event_for_each_child(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event *child; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + func(event); + list_for_each_entry(child, &event->child_list, child_list) + func(child); + mutex_unlock(&event->child_mutex); +} + +static void perf_event_for_each(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *sibling; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + event = event->group_leader; + + perf_event_for_each_child(event, func); + func(event); + list_for_each_entry(sibling, &event->sibling_list, group_entry) + perf_event_for_each_child(event, func); + mutex_unlock(&ctx->mutex); +} + +static int perf_event_period(struct perf_event *event, u64 __user *arg) +{ + struct perf_event_context *ctx = event->ctx; + unsigned long size; + int ret = 0; + u64 value; + + if (!event->attr.sample_period) + return -EINVAL; + + size = copy_from_user(&value, arg, sizeof(value)); + if (size != sizeof(value)) + return -EFAULT; + + if (!value) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) { + ret = -EINVAL; + goto unlock; + } + + event->attr.sample_freq = value; + } else { + event->attr.sample_period = value; + event->hw.sample_period = value; + } +unlock: + spin_unlock_irq(&ctx->lock); + + return ret; +} + +int perf_event_set_output(struct perf_event *event, int output_fd); + +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + void (*func)(struct perf_event *); + u32 flags = arg; + + switch (cmd) { + case PERF_EVENT_IOC_ENABLE: + func = perf_event_enable; + break; + case PERF_EVENT_IOC_DISABLE: + func = perf_event_disable; + break; + case PERF_EVENT_IOC_RESET: + func = perf_event_reset; + break; + + case PERF_EVENT_IOC_REFRESH: + return perf_event_refresh(event, arg); + + case PERF_EVENT_IOC_PERIOD: + return perf_event_period(event, (u64 __user *)arg); + + case PERF_EVENT_IOC_SET_OUTPUT: + return perf_event_set_output(event, arg); + + default: + return -ENOTTY; + } + + if (flags & PERF_IOC_FLAG_GROUP) + perf_event_for_each(event, func); + else + perf_event_for_each_child(event, func); + + return 0; +} + +int perf_event_task_enable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_enable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +int perf_event_task_disable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_disable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +#ifndef PERF_EVENT_INDEX_OFFSET +# define PERF_EVENT_INDEX_OFFSET 0 +#endif + +static int perf_event_index(struct perf_event *event) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + + return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; +} + +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. + */ +void perf_event_update_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + userpg = data->user_page; + + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); + ++userpg->lock; + barrier(); + userpg->index = perf_event_index(event); + userpg->offset = atomic64_read(&event->count); + if (event->state == PERF_EVENT_STATE_ACTIVE) + userpg->offset -= atomic64_read(&event->hw.prev_count); + + userpg->time_enabled = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + + userpg->time_running = event->total_time_running + + atomic64_read(&event->child_total_time_running); + + barrier(); + ++userpg->lock; + preempt_enable(); +unlock: + rcu_read_unlock(); +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_event *event = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; + + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + if (vmf->pgoff == 0) { + vmf->page = virt_to_page(data->user_page); + } else { + int nr = vmf->pgoff - 1; + + if ((unsigned)nr > data->nr_pages) + goto unlock; + + if (vmf->flags & FAULT_FLAG_WRITE) + goto unlock; + + vmf->page = virt_to_page(data->data_pages[nr]); + } + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + int i; + + WARN_ON(atomic_read(&event->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += nr_pages * sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->data_pages[i]) + goto fail_data_pages; + } + + data->nr_pages = nr_pages; + atomic_set(&data->lock, -1); + + if (event->attr.watermark) { + data->watermark = min_t(long, PAGE_SIZE * nr_pages, + event->attr.wakeup_watermark); + } + if (!data->watermark) + data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); + + rcu_assign_pointer(event->data, data); + + return 0; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)data->data_pages[i]); + + free_page((unsigned long)data->user_page); + +fail_user_page: + kfree(data); + +fail: + return -ENOMEM; +} + +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page((void *)addr); + + page->mapping = NULL; + __free_page(page); +} + +static void __perf_mmap_data_free(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data; + int i; + + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + + perf_mmap_free_page((unsigned long)data->user_page); + for (i = 0; i < data->nr_pages; i++) + perf_mmap_free_page((unsigned long)data->data_pages[i]); + + kfree(data); +} + +static void perf_mmap_data_free(struct perf_event *event) +{ + struct perf_mmap_data *data = event->data; + + WARN_ON(atomic_read(&event->mmap_count)); + + rcu_assign_pointer(event->data, NULL); + call_rcu(&data->rcu_head, __perf_mmap_data_free); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + atomic_inc(&event->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + WARN_ON_ONCE(event->ctx->parent_ctx); + if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + struct user_struct *user = current_user(); + + atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm); + vma->vm_mm->locked_vm -= event->data->nr_locked; + perf_mmap_data_free(event); + mutex_unlock(&event->mmap_mutex); + } +} + +static struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, + .page_mkwrite = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long user_locked, user_lock_limit; + struct user_struct *user = current_user(); + unsigned long locked, lock_limit; + unsigned long vma_size; + unsigned long nr_pages; + long user_extra, extra; + int ret = 0; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + /* + * If we have data pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + if (vma_size != PAGE_SIZE * (1 + nr_pages)) + return -EINVAL; + + if (vma->vm_pgoff != 0) + return -EINVAL; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->mmap_mutex); + if (event->output) { + ret = -EINVAL; + goto unlock; + } + + if (atomic_inc_not_zero(&event->mmap_count)) { + if (nr_pages != event->data->nr_pages) + ret = -EINVAL; + goto unlock; + } + + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + + /* + * Increase the limit linearly with more CPUs: + */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm) + user_extra; + + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; + + if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } + + WARN_ON(event->data); + ret = perf_mmap_data_alloc(event, nr_pages); + if (ret) + goto unlock; + + atomic_set(&event->mmap_count, 1); + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->locked_vm += extra; + event->data->nr_locked = extra; + if (vma->vm_flags & VM_WRITE) + event->data->writable = 1; + +unlock: + mutex_unlock(&event->mmap_mutex); + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + + return ret; +} + +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_event *event = filp->private_data; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &event->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + +static const struct file_operations perf_fops = { + .release = perf_release, + .read = perf_read, + .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, + .fasync = perf_fasync, +}; + +/* + * Perf event wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_event_wakeup(struct perf_event *event) +{ + wake_up_all(&event->waitq); + + if (event->pending_kill) { + kill_fasync(&event->fasync, SIGIO, event->pending_kill); + event->pending_kill = 0; + } +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. + */ + +static void perf_pending_event(struct perf_pending_entry *entry) +{ + struct perf_event *event = container_of(entry, + struct perf_event, pending); + + if (event->pending_disable) { + event->pending_disable = 0; + __perf_event_disable(event); + } + + if (event->pending_wakeup) { + event->pending_wakeup = 0; + perf_event_wakeup(event); + } +} + +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) +{ + struct perf_pending_entry **head; + + if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) + return; + + entry->func = func; + + head = &get_cpu_var(perf_pending_head); + + do { + entry->next = *head; + } while (cmpxchg(head, entry->next, entry) != entry->next); + + set_perf_event_pending(); + + put_cpu_var(perf_pending_head); +} + +static int __perf_pending_run(void) +{ + struct perf_pending_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + void (*func)(struct perf_pending_entry *); + struct perf_pending_entry *entry = list; + + list = list->next; + + func = entry->func; + entry->next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + func(entry); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_event *event) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. -- see perf_pending_handle() + */ + smp_rmb(); + return event->pending.next == NULL; +} + +static void perf_pending_sync(struct perf_event *event) +{ + wait_event(event->waitq, perf_not_pending(event)); +} + +void perf_event_do_pending(void) +{ + __perf_pending_run(); +} + +/* + * Callchain support -- arch specific + */ + +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + return NULL; +} + +/* + * Output + */ +static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, + unsigned long offset, unsigned long head) +{ + unsigned long mask; + + if (!data->writable) + return true; + + mask = (data->nr_pages << PAGE_SHIFT) - 1; + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->data->poll, POLL_IN); + + if (handle->nmi) { + handle->event->pending_wakeup = 1; + perf_pending_queue(&handle->event->pending, + perf_pending_event); + } else + perf_event_wakeup(handle->event); +} + +/* + * Curious locking construct. + * + * We need to ensure a later event_id doesn't publish a head when a former + * event_id isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event_id completes. + */ +static void perf_output_lock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int cpu; + + handle->locked = 0; + + local_irq_save(handle->flags); + cpu = smp_processor_id(); + + if (in_nmi() && atomic_read(&data->lock) == cpu) + return; + + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + unsigned long head; + int cpu; + + data->done_head = data->head; + + if (!handle->locked) + goto out; + +again: + /* + * The xchg implies a full barrier that ensures all writes are done + * before we publish the new head, matched by a rmb() in userspace when + * reading this position. + */ + while ((head = atomic_long_xchg(&data->done_head, 0))) + data->user_page->data_head = head; + + /* + * NMI can happen here, which means we can miss a done_head update. + */ + + cpu = atomic_xchg(&data->lock, -1); + WARN_ON_ONCE(cpu != smp_processor_id()); + + /* + * Therefore we have to validate we did not indeed do so. + */ + if (unlikely(atomic_long_read(&data->done_head))) { + /* + * Since we had it locked, we can lock it again. + */ + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + goto again; + } + + if (atomic_xchg(&data->wakeup, 0)) + perf_output_wakeup(handle); +out: + local_irq_restore(handle->flags); +} + +void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned int offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned int page_offset; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_offset = offset & (PAGE_SIZE - 1); + size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; + + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample) +{ + struct perf_event *output_event; + struct perf_mmap_data *data; + unsigned long tail, offset, head; + int have_lost; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; + + rcu_read_lock(); + /* + * For inherited events we send all the output towards the parent. + */ + if (event->parent) + event = event->parent; + + output_event = rcu_dereference(event->output); + if (output_event) + event = output_event; + + data = rcu_dereference(event->data); + if (!data) + goto out; + + handle->data = data; + handle->event = event; + handle->nmi = nmi; + handle->sample = sample; + + if (!data->nr_pages) + goto fail; + + have_lost = atomic_read(&data->lost); + if (have_lost) + size += sizeof(lost_event); + + perf_output_lock(handle); + + do { + /* + * Userspace could choose to issue a mb() before updating the + * tail pointer. So that all reads will be completed before the + * write is issued. + */ + tail = ACCESS_ONCE(data->user_page->data_tail); + smp_rmb(); + offset = head = atomic_long_read(&data->head); + head += size; + if (unlikely(!perf_output_space(data, tail, offset, head))) + goto fail; + } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); + + handle->offset = offset; + handle->head = head; + + if (head - tail > data->watermark) + atomic_set(&data->wakeup, 1); + + if (have_lost) { + lost_event.header.type = PERF_RECORD_LOST; + lost_event.header.misc = 0; + lost_event.header.size = sizeof(lost_event); + lost_event.id = event->id; + lost_event.lost = atomic_xchg(&data->lost, 0); + + perf_output_put(handle, lost_event); + } + + return 0; + +fail: + atomic_inc(&data->lost); + perf_output_unlock(handle); +out: + rcu_read_unlock(); + + return -ENOSPC; +} + +void perf_output_end(struct perf_output_handle *handle) +{ + struct perf_event *event = handle->event; + struct perf_mmap_data *data = handle->data; + + int wakeup_events = event->attr.wakeup_events; + + if (handle->sample && wakeup_events) { + int events = atomic_inc_return(&data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &data->events); + atomic_set(&data->wakeup, 1); + } + } + + perf_output_unlock(handle); + rcu_read_unlock(); +} + +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_tgid_nr_ns(p, event->ns); +} + +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_pid_nr_ns(p, event->ns); +} + +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_event *event) +{ + u64 read_format = event->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = atomic64_read(&event->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader, *sub; + u64 read_format = event->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != event) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + n = 0; + + if (sub != event) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_event *event) +{ + if (event->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, event); + else + perf_output_read_one(handle, event); +} + +void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = data->type; + + perf_output_put(handle, *header); + + if (sample_type & PERF_SAMPLE_IP) + perf_output_put(handle, data->ip); + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(handle, data->addr); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(handle, data->period); + + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(handle, event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (data->callchain) { + int size = 1; + + if (data->callchain) + size += data->callchain->nr; + + size *= sizeof(u64); + + perf_output_copy(handle, data->callchain, size); + } else { + u64 nr = 0; + perf_output_put(handle, nr); + } + } + + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(handle, data->raw->size); + perf_output_copy(handle, data->raw->data, + data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(handle, raw); + } + } +} + +void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + u64 sample_type = event->attr.sample_type; + + data->type = sample_type; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header); + + header->misc = 0; + header->misc |= perf_misc_flags(regs); + + if (sample_type & PERF_SAMPLE_IP) { + data->ip = perf_instruction_pointer(regs); + + header->size += sizeof(data->ip); + } + + if (sample_type & PERF_SAMPLE_TID) { + /* namespace issues */ + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); + + header->size += sizeof(data->tid_entry); + } + + if (sample_type & PERF_SAMPLE_TIME) { + data->time = perf_clock(); + + header->size += sizeof(data->time); + } + + if (sample_type & PERF_SAMPLE_ADDR) + header->size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_ID) { + data->id = primary_event_id(event); + + header->size += sizeof(data->id); + } + + if (sample_type & PERF_SAMPLE_STREAM_ID) { + data->stream_id = event->id; + + header->size += sizeof(data->stream_id); + } + + if (sample_type & PERF_SAMPLE_CPU) { + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; + + header->size += sizeof(data->cpu_entry); + } + + if (sample_type & PERF_SAMPLE_PERIOD) + header->size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + header->size += perf_event_read_size(event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + int size = 1; + + data->callchain = perf_callchain(regs); + + if (data->callchain) + size += data->callchain->nr; + + header->size += size * sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_RAW) { + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header->size += size; + } +} + +static void perf_event_output(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_output_handle handle; + struct perf_event_header header; + + perf_prepare_sample(&header, data, event, regs); + + if (perf_output_begin(&handle, event, header.size, nmi, 1)) + return; + + perf_output_sample(&handle, &header, data, event); + + perf_output_end(&handle); +} + +/* + * read event_id + */ + +struct perf_read_event { + struct perf_event_header header; + + u32 pid; + u32 tid; +}; + +static void +perf_event_read_event(struct perf_event *event, + struct task_struct *task) +{ + struct perf_output_handle handle; + struct perf_read_event read_event = { + .header = { + .type = PERF_RECORD_READ, + .misc = 0, + .size = sizeof(read_event) + perf_event_read_size(event), + }, + .pid = perf_event_pid(event, task), + .tid = perf_event_tid(event, task), + }; + int ret; + + ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); + if (ret) + return; + + perf_output_put(&handle, read_event); + perf_output_read(&handle, event); + + perf_output_end(&handle); +} + +/* + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task + */ + +struct perf_task_event { + struct task_struct *task; + struct perf_event_context *task_ctx; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + u32 tid; + u32 ptid; + u64 time; + } event_id; +}; + +static void perf_event_task_output(struct perf_event *event, + struct perf_task_event *task_event) +{ + struct perf_output_handle handle; + int size; + struct task_struct *task = task_event->task; + int ret; + + size = task_event->event_id.header.size; + ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + task_event->event_id.pid = perf_event_pid(event, task); + task_event->event_id.ppid = perf_event_pid(event, current); + + task_event->event_id.tid = perf_event_tid(event, task); + task_event->event_id.ptid = perf_event_tid(event, current); + + task_event->event_id.time = perf_clock(); + + perf_output_put(&handle, task_event->event_id); + + perf_output_end(&handle); +} + +static int perf_event_task_match(struct perf_event *event) +{ + if (event->attr.comm || event->attr.mmap || event->attr.task) + return 1; + + return 0; +} + +static void perf_event_task_ctx(struct perf_event_context *ctx, + struct perf_task_event *task_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_task_match(event)) + perf_event_task_output(event, task_event); + } + rcu_read_unlock(); +} + +static void perf_event_task_event(struct perf_task_event *task_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx = task_event->task_ctx; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_task_ctx(&cpuctx->ctx, task_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + if (!ctx) + ctx = rcu_dereference(task_event->task->perf_event_ctxp); + if (ctx) + perf_event_task_ctx(ctx, task_event); + rcu_read_unlock(); +} + +static void perf_event_task(struct task_struct *task, + struct perf_event_context *task_ctx, + int new) +{ + struct perf_task_event task_event; + + if (!atomic_read(&nr_comm_events) && + !atomic_read(&nr_mmap_events) && + !atomic_read(&nr_task_events)) + return; + + task_event = (struct perf_task_event){ + .task = task, + .task_ctx = task_ctx, + .event_id = { + .header = { + .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, + .misc = 0, + .size = sizeof(task_event.event_id), + }, + /* .pid */ + /* .ppid */ + /* .tid */ + /* .ptid */ + }, + }; + + perf_event_task_event(&task_event); +} + +void perf_event_fork(struct task_struct *task) +{ + perf_event_task(task, NULL, 1); +} + +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event_id; +}; + +static void perf_event_comm_output(struct perf_event *event, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + int size = comm_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + comm_event->event_id.pid = perf_event_pid(event, comm_event->task); + comm_event->event_id.tid = perf_event_tid(event, comm_event->task); + + perf_output_put(&handle, comm_event->event_id); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + perf_output_end(&handle); +} + +static int perf_event_comm_match(struct perf_event *event) +{ + if (event->attr.comm) + return 1; + + return 0; +} + +static void perf_event_comm_ctx(struct perf_event_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_comm_match(event)) + perf_event_comm_output(event, comm_event); + } + rcu_read_unlock(); +} + +static void perf_event_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + unsigned int size; + char comm[TASK_COMM_LEN]; + + memset(comm, 0, sizeof(comm)); + strncpy(comm, comm_event->task->comm, sizeof(comm)); + size = ALIGN(strlen(comm)+1, sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_comm_ctx(&cpuctx->ctx, comm_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_event_comm_ctx(ctx, comm_event); + rcu_read_unlock(); +} + +void perf_event_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event; + + if (task->perf_event_ctxp) + perf_event_enable_on_exec(task); + + if (!atomic_read(&nr_comm_events)) + return; + + comm_event = (struct perf_comm_event){ + .task = task, + /* .comm */ + /* .comm_size */ + .event_id = { + .header = { + .type = PERF_RECORD_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + }, + }; + + perf_event_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct vm_area_struct *vma; + + const char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event_id; +}; + +static void perf_event_mmap_output(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + int size = mmap_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + mmap_event->event_id.pid = perf_event_pid(event, current); + mmap_event->event_id.tid = perf_event_tid(event, current); + + perf_output_put(&handle, mmap_event->event_id); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + perf_output_end(&handle); +} + +static int perf_event_mmap_match(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + if (event->attr.mmap) + return 1; + + return 0; +} + +static void perf_event_mmap_ctx(struct perf_event_context *ctx, + struct perf_mmap_event *mmap_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_mmap_match(event, mmap_event)) + perf_event_mmap_output(event, mmap_event); + } + rcu_read_unlock(); +} + +static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + const char *name; + + memset(tmp, 0, sizeof(tmp)); + + if (file) { + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. + */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = d_path(&file->f_path, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); + goto got_name; + } + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } + + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name)+1, sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_event_mmap_ctx(ctx, mmap_event); + rcu_read_unlock(); + + kfree(buf); +} + +void __perf_event_mmap(struct vm_area_struct *vma) +{ + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_events)) + return; + + mmap_event = (struct perf_mmap_event){ + .vma = vma, + /* .file_name */ + /* .file_size */ + .event_id = { + .header = { + .type = PERF_RECORD_MMAP, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + .start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = vma->vm_pgoff, + }, + }; + + perf_event_mmap_event(&mmap_event); +} + +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_event *event, int enable) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 id; + u64 stream_id; + } throttle_event = { + .header = { + .type = PERF_RECORD_THROTTLE, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = perf_clock(), + .id = primary_event_id(event), + .stream_id = event->id, + }; + + if (enable) + throttle_event.header.type = PERF_RECORD_UNTHROTTLE; + + ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_output_end(&handle); +} + +/* + * Generic event overflow handling, sampling. + */ + +static int __perf_event_overflow(struct perf_event *event, int nmi, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int events = atomic_read(&event->event_limit); + struct hw_perf_event *hwc = &event->hw; + int ret = 0; + + throttle = (throttle && event->pmu->unthrottle != NULL); + + if (!throttle) { + hwc->interrupts++; + } else { + if (hwc->interrupts != MAX_INTERRUPTS) { + hwc->interrupts++; + if (HZ * hwc->interrupts > + (u64)sysctl_perf_event_sample_rate) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); + ret = 1; + } + } else { + /* + * Keep re-disabling events even though on the previous + * pass we disabled it - just in case we raced with a + * sched-in and the event got enabled again: + */ + ret = 1; + } + } + + if (event->attr.freq) { + u64 now = perf_clock(); + s64 delta = now - hwc->freq_stamp; + + hwc->freq_stamp = now; + + if (delta > 0 && delta < TICK_NSEC) + perf_adjust_period(event, NSEC_PER_SEC / (int)delta); + } + + /* + * XXX event_limit might not quite work as expected on inherited + * events + */ + + event->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&event->event_limit)) { + ret = 1; + event->pending_kill = POLL_HUP; + if (nmi) { + event->pending_disable = 1; + perf_pending_queue(&event->pending, + perf_pending_event); + } else + perf_event_disable(event); + } + + perf_event_output(event, nmi, data, regs); + return ret; +} + +int perf_event_overflow(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + return __perf_event_overflow(event, nmi, 1, data, regs); +} + +/* + * Generic software event infrastructure + */ + +/* + * We directly increment event->count and keep a second value in + * event->hw.period_left to count intervals. This period event + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +static u64 perf_swevent_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; + +again: + old = val = atomic64_read(&hwc->period_left); + if (val < 0) + return 0; + + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; + + return nr; +} + +static void perf_swevent_overflow(struct perf_event *event, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + int throttle = 0; + u64 overflow; + + data->period = event->hw.last_period; + overflow = perf_swevent_set_period(event); + + if (hwc->interrupts == MAX_INTERRUPTS) + return; + + for (; overflow; overflow--) { + if (__perf_event_overflow(event, nmi, throttle, + data, regs)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + throttle = 1; + } +} + +static void perf_swevent_unthrottle(struct perf_event *event) +{ + /* + * Nothing to do, we already reset hwc->interrupts. + */ +} + +static void perf_swevent_add(struct perf_event *event, u64 nr, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + + atomic64_add(nr, &event->count); + + if (!hwc->sample_period) + return; + + if (!regs) + return; + + if (!atomic64_add_negative(nr, &hwc->period_left)) + perf_swevent_overflow(event, nmi, data, regs); +} + +static int perf_swevent_is_counting(struct perf_event *event) +{ + /* + * The event is active, we're good! + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) + return 1; + + /* + * The event is off/error, not counting. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE) + return 0; + + /* + * The event is inactive, if the context is active + * we're part of a group that didn't make it on the 'pmu', + * not counting. + */ + if (event->ctx->is_active) + return 0; + + /* + * We're inactive and the context is too, this means the + * task is scheduled out, we're counting events that happen + * to us, like migration events. + */ + return 1; +} + +static int perf_swevent_match(struct perf_event *event, + enum perf_type_id type, + u32 event_id, struct pt_regs *regs) +{ + if (!perf_swevent_is_counting(event)) + return 0; + + if (event->attr.type != type) + return 0; + if (event->attr.config != event_id) + return 0; + + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 0; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 0; + } + + return 1; +} + +static void perf_swevent_ctx_event(struct perf_event_context *ctx, + enum perf_type_id type, + u32 event_id, u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_swevent_match(event, type, event_id, regs)) + perf_swevent_add(event, nr, nmi, data, regs); + } + rcu_read_unlock(); +} + +static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) +{ + if (in_nmi()) + return &cpuctx->recursion[3]; + + if (in_irq()) + return &cpuctx->recursion[2]; + + if (in_softirq()) + return &cpuctx->recursion[1]; + + return &cpuctx->recursion[0]; +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int *recursion = perf_swevent_recursion_context(cpuctx); + struct perf_event_context *ctx; + + if (*recursion) + goto out; + + (*recursion)++; + barrier(); + + perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, + nr, nmi, data, regs); + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); + rcu_read_unlock(); + + barrier(); + (*recursion)--; + +out: + put_cpu_var(perf_cpu_context); +} + +void __perf_sw_event(u32 event_id, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) +{ + struct perf_sample_data data = { + .addr = addr, + }; + + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, + &data, regs); +} + +static void perf_swevent_read(struct perf_event *event) +{ +} + +static int perf_swevent_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->sample_period) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(event); + } + return 0; +} + +static void perf_swevent_disable(struct perf_event *event) +{ +} + +static const struct pmu perf_ops_generic = { + .enable = perf_swevent_enable, + .disable = perf_swevent_disable, + .read = perf_swevent_read, + .unthrottle = perf_swevent_unthrottle, +}; + +/* + * hrtimer based swevent callback + */ + +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct pt_regs *regs; + struct perf_event *event; + u64 period; + + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event->pmu->read(event); + + data.addr = 0; + regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. + */ + if ((event->attr.exclude_kernel || !regs) && + !event->attr.exclude_user) + regs = task_pt_regs(current); + + if (regs) { + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + +/* + * Software event: cpu wall time clock + */ + +static void cpu_clock_perf_event_update(struct perf_event *event) +{ + int cpu = raw_smp_processor_id(); + s64 prev; + u64 now; + + now = cpu_clock(cpu); + prev = atomic64_read(&event->hw.prev_count); + atomic64_set(&event->hw.prev_count, now); + atomic64_add(now - prev, &event->count); +} + +static int cpu_clock_perf_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int cpu = raw_smp_processor_id(); + + atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void cpu_clock_perf_event_disable(struct perf_event *event) +{ + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + cpu_clock_perf_event_update(event); +} + +static void cpu_clock_perf_event_read(struct perf_event *event) +{ + cpu_clock_perf_event_update(event); +} + +static const struct pmu perf_ops_cpu_clock = { + .enable = cpu_clock_perf_event_enable, + .disable = cpu_clock_perf_event_disable, + .read = cpu_clock_perf_event_read, +}; + +/* + * Software event: task time clock + */ + +static void task_clock_perf_event_update(struct perf_event *event, u64 now) +{ + u64 prev; + s64 delta; + + prev = atomic64_xchg(&event->hw.prev_count, now); + delta = now - prev; + atomic64_add(delta, &event->count); +} + +static int task_clock_perf_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 now; + + now = event->ctx->time; + + atomic64_set(&hwc->prev_count, now); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void task_clock_perf_event_disable(struct perf_event *event) +{ + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + task_clock_perf_event_update(event, event->ctx->time); + +} + +static void task_clock_perf_event_read(struct perf_event *event) +{ + u64 time; + + if (!in_nmi()) { + update_context_time(event->ctx); + time = event->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + time = event->ctx->time + delta; + } + + task_clock_perf_event_update(event, time); +} + +static const struct pmu perf_ops_task_clock = { + .enable = task_clock_perf_event_enable, + .disable = task_clock_perf_event_disable, + .read = task_clock_perf_event_read, +}; + +#ifdef CONFIG_EVENT_PROFILE +void perf_tp_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) +{ + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + struct perf_sample_data data = { + .addr = addr, + .raw = &raw, + }; + + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); + + do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + &data, regs); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + +extern int ftrace_profile_enable(int); +extern void ftrace_profile_disable(int); + +static void tp_perf_event_destroy(struct perf_event *event) +{ + ftrace_profile_disable(event->attr.config); +} + +static const struct pmu *tp_perf_event_init(struct perf_event *event) +{ + /* + * Raw tracepoint data is a severe data leak, only allow root to + * have these. + */ + if ((event->attr.sample_type & PERF_SAMPLE_RAW) && + perf_paranoid_tracepoint_raw() && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if (ftrace_profile_enable(event->attr.config)) + return NULL; + + event->destroy = tp_perf_event_destroy; + + return &perf_ops_generic; +} +#else +static const struct pmu *tp_perf_event_init(struct perf_event *event) +{ + return NULL; +} +#endif + +atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + WARN_ON(event->parent); + + atomic_dec(&perf_swevent_enabled[event_id]); +} + +static const struct pmu *sw_perf_event_init(struct perf_event *event) +{ + const struct pmu *pmu = NULL; + u64 event_id = event->attr.config; + + /* + * Software events (currently) can't in general distinguish + * between user, kernel and hypervisor events. + * However, context switches and cpu migrations are considered + * to be kernel events, and page faults are never hypervisor + * events. + */ + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_TASK_CLOCK: + /* + * If the user instantiates this as a per-cpu event, + * use the cpu_clock event instead. + */ + if (event->ctx->task) + pmu = &perf_ops_task_clock; + else + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_PAGE_FAULTS: + case PERF_COUNT_SW_PAGE_FAULTS_MIN: + case PERF_COUNT_SW_PAGE_FAULTS_MAJ: + case PERF_COUNT_SW_CONTEXT_SWITCHES: + case PERF_COUNT_SW_CPU_MIGRATIONS: + if (!event->parent) { + atomic_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + pmu = &perf_ops_generic; + break; + } + + return pmu; +} + +/* + * Allocate and initialize a event structure + */ +static struct perf_event * +perf_event_alloc(struct perf_event_attr *attr, + int cpu, + struct perf_event_context *ctx, + struct perf_event *group_leader, + struct perf_event *parent_event, + gfp_t gfpflags) +{ + const struct pmu *pmu; + struct perf_event *event; + struct hw_perf_event *hwc; + long err; + + event = kzalloc(sizeof(*event), gfpflags); + if (!event) + return ERR_PTR(-ENOMEM); + + /* + * Single events are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = event; + + mutex_init(&event->child_mutex); + INIT_LIST_HEAD(&event->child_list); + + INIT_LIST_HEAD(&event->group_entry); + INIT_LIST_HEAD(&event->event_entry); + INIT_LIST_HEAD(&event->sibling_list); + init_waitqueue_head(&event->waitq); + + mutex_init(&event->mmap_mutex); + + event->cpu = cpu; + event->attr = *attr; + event->group_leader = group_leader; + event->pmu = NULL; + event->ctx = ctx; + event->oncpu = -1; + + event->parent = parent_event; + + event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->id = atomic64_inc_return(&perf_event_id); + + event->state = PERF_EVENT_STATE_INACTIVE; + + if (attr->disabled) + event->state = PERF_EVENT_STATE_OFF; + + pmu = NULL; + + hwc = &event->hw; + hwc->sample_period = attr->sample_period; + if (attr->freq && attr->sample_freq) + hwc->sample_period = 1; + hwc->last_period = hwc->sample_period; + + atomic64_set(&hwc->period_left, hwc->sample_period); + + /* + * we currently do not support PERF_FORMAT_GROUP on inherited events + */ + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) + goto done; + + switch (attr->type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + pmu = hw_perf_event_init(event); + break; + + case PERF_TYPE_SOFTWARE: + pmu = sw_perf_event_init(event); + break; + + case PERF_TYPE_TRACEPOINT: + pmu = tp_perf_event_init(event); + break; + + default: + break; + } +done: + err = 0; + if (!pmu) + err = -EINVAL; + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); + + if (err) { + if (event->ns) + put_pid_ns(event->ns); + kfree(event); + return ERR_PTR(err); + } + + event->pmu = pmu; + + if (!event->parent) { + atomic_inc(&nr_events); + if (event->attr.mmap) + atomic_inc(&nr_mmap_events); + if (event->attr.comm) + atomic_inc(&nr_comm_events); + if (event->attr.task) + atomic_inc(&nr_task_events); + } + + return event; +} + +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. + */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + +int perf_event_set_output(struct perf_event *event, int output_fd) +{ + struct perf_event *output_event = NULL; + struct file *output_file = NULL; + struct perf_event *old_output; + int fput_needed = 0; + int ret = -EINVAL; + + if (!output_fd) + goto set; + + output_file = fget_light(output_fd, &fput_needed); + if (!output_file) + return -EBADF; + + if (output_file->f_op != &perf_fops) + goto out; + + output_event = output_file->private_data; + + /* Don't chain output fds */ + if (output_event->output) + goto out; + + /* Don't set an output fd when we already have an output channel */ + if (event->data) + goto out; + + atomic_long_inc(&output_file->f_count); + +set: + mutex_lock(&event->mmap_mutex); + old_output = event->output; + rcu_assign_pointer(event->output, output_event); + mutex_unlock(&event->mmap_mutex); + + if (old_output) { + /* + * we need to make sure no existing perf_output_*() + * is still referencing this event. + */ + synchronize_rcu(); + fput(old_output->filp); + } + + ret = 0; +out: + fput_light(output_file, fput_needed); + return ret; +} + +/** + * sys_perf_event_open - open a performance event, associate it to a task/cpu + * + * @attr_uptr: event_id type attributes for monitoring/sampling + * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader event fd + */ +SYSCALL_DEFINE5(perf_event_open, + struct perf_event_attr __user *, attr_uptr, + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) +{ + struct perf_event *event, *group_leader; + struct perf_event_attr attr; + struct perf_event_context *ctx; + struct file *event_file = NULL; + struct file *group_file = NULL; + int fput_needed = 0; + int fput_needed2 = 0; + int err; + + /* for future expandability... */ + if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) + return -EINVAL; + + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; + + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_event_sample_rate) + return -EINVAL; + } + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + /* + * Look up the group leader (we will attach this event to it): + */ + group_leader = NULL; + if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { + err = -EINVAL; + group_file = fget_light(group_fd, &fput_needed); + if (!group_file) + goto err_put_context; + if (group_file->f_op != &perf_fops) + goto err_put_context; + + group_leader = group_file->private_data; + /* + * Do not allow a recursive hierarchy (this new sibling + * becoming part of another group-sibling): + */ + if (group_leader->group_leader != group_leader) + goto err_put_context; + /* + * Do not allow to attach to a group in a different + * task or CPU context: + */ + if (group_leader->ctx != ctx) + goto err_put_context; + /* + * Only a group leader can be exclusive or pinned + */ + if (attr.exclusive || attr.pinned) + goto err_put_context; + } + + event = perf_event_alloc(&attr, cpu, ctx, group_leader, + NULL, GFP_KERNEL); + err = PTR_ERR(event); + if (IS_ERR(event)) + goto err_put_context; + + err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); + if (err < 0) + goto err_free_put_context; + + event_file = fget_light(err, &fput_needed2); + if (!event_file) + goto err_free_put_context; + + if (flags & PERF_FLAG_FD_OUTPUT) { + err = perf_event_set_output(event, group_fd); + if (err) + goto err_fput_free_put_context; + } + + event->filp = event_file; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + mutex_unlock(&ctx->mutex); + + event->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); + +err_fput_free_put_context: + fput_light(event_file, fput_needed2); + +err_free_put_context: + if (err < 0) + kfree(event); + +err_put_context: + if (err < 0) + put_ctx(ctx); + + fput_light(group_file, fput_needed); + + return err; +} + +/* + * inherit a event from parent task to child task: + */ +static struct perf_event * +inherit_event(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event *group_leader, + struct perf_event_context *child_ctx) +{ + struct perf_event *child_event; + + /* + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_event->parent) + parent_event = parent_event->parent; + + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, child_ctx, + group_leader, parent_event, + GFP_KERNEL); + if (IS_ERR(child_event)) + return child_event; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent event, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_event_{en, dis}able_family. + */ + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; + else + child_event->state = PERF_EVENT_STATE_OFF; + + if (parent_event->attr.freq) + child_event->hw.sample_period = parent_event->hw.sample_period; + + /* + * Link it up in the child's context: + */ + add_event_to_ctx(child_event, child_ctx); + + /* + * Get a reference to the parent filp - we will fput it + * when the child event exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_event->filp->f_count); + + /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + return child_event; +} + +static int inherit_group(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event_context *child_ctx) +{ + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; + + leader = inherit_event(parent_event, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; +} + +static void sync_child_event(struct perf_event *child_event, + struct task_struct *child) +{ + struct perf_event *parent_event = child_event->parent; + u64 child_val; + + if (child_event->attr.inherit_stat) + perf_event_read_event(child_event, child); + + child_val = atomic64_read(&child_event->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_event->count); + atomic64_add(child_event->total_time_enabled, + &parent_event->child_total_time_enabled); + atomic64_add(child_event->total_time_running, + &parent_event->child_total_time_running); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Release the parent event, if this was the last + * reference to it. + */ + fput(parent_event->filp); +} + +static void +__perf_event_exit_task(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) +{ + struct perf_event *parent_event; + + update_event_times(child_event); + perf_event_remove_from_context(child_event); + + parent_event = child_event->parent; + /* + * It can happen that parent exits first, and has events + * that are still around due to the child reference. These + * events need to be zapped - but otherwise linger. + */ + if (parent_event) { + sync_child_event(child_event, child); + free_event(child_event); + } +} + +/* + * When a child task exits, feed back event values to parent events. + */ +void perf_event_exit_task(struct task_struct *child) +{ + struct perf_event *child_event, *tmp; + struct perf_event_context *child_ctx; + unsigned long flags; + + if (likely(!child->perf_event_ctxp)) { + perf_event_task(child, NULL, 0); + return; + } + + local_irq_save(flags); + /* + * We can't reschedule here because interrupts are disabled, + * and either child is current or it is a task that can't be + * scheduled, so we are now safe from rescheduling changing + * our context. + */ + child_ctx = child->perf_event_ctxp; + __perf_event_task_sched_out(child_ctx); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_event_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + spin_lock(&child_ctx->lock); + child->perf_event_ctxp = NULL; + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the events from it. + */ + unclone_ctx(child_ctx); + spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the events so that we + * won't get any samples after PERF_RECORD_EXIT. We can however still + * get a few PERF_RECORD_READ events. + */ + perf_event_task(child, child_ctx, 0); + + /* + * We can recurse on the same lock type through: + * + * __perf_event_exit_task() + * sync_child_event() + * fput(parent_event->filp) + * perf_release() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + +again: + list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + /* + * If the last event was a group event, it will have appended all + * its siblings to the list, but we obtained 'tmp' before that which + * will still point to the list head terminating the iteration. + */ + if (!list_empty(&child_ctx->group_list)) + goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); +} + +/* + * free an unexposed, unused context as created by inheritance by + * init_task below, used by fork() in case of fail. + */ +void perf_event_free_task(struct task_struct *task) +{ + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event *event, *tmp; + + if (!ctx) + return; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + continue; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_event(event, ctx); + free_event(event); + } + + if (!list_empty(&ctx->group_list)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_task(struct task_struct *child) +{ + struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *cloned_ctx; + struct perf_event *event; + struct task_struct *parent = current; + int inherited_all = 1; + int ret = 0; + + child->perf_event_ctxp = NULL; + + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); + + if (likely(!parent->perf_event_ctxp)) + return 0; + + /* + * This is executed from the parent task context, so inherit + * events that have been marked for cloning. + * First allocate and initialize a context for the child. + */ + + child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + if (!child_ctx) + return -ENOMEM; + + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + + /* + * If the parent's context is a clone, pin it so it won't get + * swapped under us. + */ + parent_ctx = perf_pin_task_context(parent); + + /* + * No need to check if parent_ctx != NULL here; since we saw + * it non-NULL earlier, the only reason for it to become NULL + * is if we exit, and since we're currently in the middle of + * a fork we can't be exiting at the same time. + */ + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. + */ + mutex_lock(&parent_ctx->mutex); + + /* + * We dont have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) { + if (event != event->group_leader) + continue; + + if (!event->attr.inherit) { + inherited_all = 0; + continue; + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + if (ret) { + inherited_all = 0; + break; + } + } + + if (inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. + * Note that if the parent is a clone, it could get + * uncloned at any point, but that doesn't matter + * because the list of events and the generation + * count can't have changed since we took the mutex. + */ + cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); + if (cloned_ctx) { + child_ctx->parent_ctx = cloned_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); + } + + mutex_unlock(&parent_ctx->mutex); + + perf_unpin_context(parent_ctx); + + return ret; +} + +static void __cpuinit perf_event_init_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx, NULL); + + spin_lock(&perf_resource_lock); + cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; + spin_unlock(&perf_resource_lock); + + hw_perf_event_setup(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __perf_event_exit_cpu(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_event *event, *tmp; + + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) + __perf_event_remove_from_context(event); +} +static void perf_event_exit_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); + mutex_unlock(&ctx->mutex); +} +#else +static inline void perf_event_exit_cpu(int cpu) { } +#endif + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + perf_event_init_cpu(cpu); + break; + + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_event_setup_online(cpu); + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + perf_event_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +/* + * This has to have a higher priority than migration_notifier in sched.c. + */ +static struct notifier_block __cpuinitdata perf_cpu_nb = { + .notifier_call = perf_cpu_notify, + .priority = 20, +}; + +void __init perf_event_init(void) +{ + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&perf_cpu_nb); +} + +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_reserved_percpu); +} + +static ssize_t +perf_set_reserve_percpu(struct sysdev_class *class, + const char *buf, + size_t count) +{ + struct perf_cpu_context *cpuctx; + unsigned long val; + int err, cpu, mpt; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > perf_max_events) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_reserved_percpu = val; + for_each_online_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + spin_lock_irq(&cpuctx->ctx.lock); + mpt = min(perf_max_events - cpuctx->ctx.nr_events, + perf_max_events - perf_reserved_percpu); + cpuctx->max_pertask = mpt; + spin_unlock_irq(&cpuctx->ctx.lock); + } + spin_unlock(&perf_resource_lock); + + return count; +} + +static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_overcommit); +} + +static ssize_t +perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +{ + unsigned long val; + int err; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > 1) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_overcommit = val; + spin_unlock(&perf_resource_lock); + + return count; +} + +static SYSDEV_CLASS_ATTR( + reserve_percpu, + 0644, + perf_show_reserve_percpu, + perf_set_reserve_percpu + ); + +static SYSDEV_CLASS_ATTR( + overcommit, + 0644, + perf_show_overcommit, + perf_set_overcommit + ); + +static struct attribute *perfclass_attrs[] = { + &attr_reserve_percpu.attr, + &attr_overcommit.attr, + NULL +}; + +static struct attribute_group perfclass_attr_group = { + .attrs = perfclass_attrs, + .name = "perf_events", +}; + +static int __init perf_event_sysfs_init(void) +{ + return sysfs_create_group(&cpu_sysdev_class.kset.kobj, + &perfclass_attr_group); +} +device_initcall(perf_event_sysfs_init); diff --git a/kernel/pid.c b/kernel/pid.c index 31310b5..d3f722d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -40,7 +40,7 @@ #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) static struct hlist_head *pid_hash; -static int pidhash_shift; +static unsigned int pidhash_shift = 4; struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; @@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) void __init pidhash_init(void) { int i, pidhash_size; - unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); - pidhash_shift = max(4, fls(megabytes * 4)); - pidhash_shift = min(12, pidhash_shift); + pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, + HASH_EARLY | HASH_SMALL, + &pidhash_shift, NULL, 4096); pidhash_size = 1 << pidhash_shift; - printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", - pidhash_size, pidhash_shift, - pidhash_size * sizeof(struct hlist_head)); - - pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); - if (!pid_hash) - panic("Could not alloc pidhash!\n"); for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e33a21c..5c9dc22 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -8,17 +8,18 @@ #include <linux/math64.h> #include <asm/uaccess.h> #include <linux/kernel_stat.h> +#include <trace/events/timer.h> /* * Called after updating RLIMIT_CPU to set timer expiration if necessary. */ void update_rlimit_cpu(unsigned long rlim_new) { - cputime_t cputime; + cputime_t cputime = secs_to_cputime(rlim_new); + struct signal_struct *const sig = current->signal; - cputime = secs_to_cputime(rlim_new); - if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || - cputime_gt(current->signal->it_prof_expires, cputime)) { + if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || + cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { spin_lock_irq(¤t->sighand->siglock); set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); spin_unlock_irq(¤t->sighand->siglock); @@ -542,6 +543,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } +static inline int expires_gt(cputime_t expires, cputime_t new_exp) +{ + return cputime_eq(expires, cputime_zero) || + cputime_gt(expires, new_exp); +} + +static inline int expires_le(cputime_t expires, cputime_t new_exp) +{ + return !cputime_eq(expires, cputime_zero) && + cputime_le(expires, new_exp); +} /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -586,34 +598,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + union cpu_time_count *exp = &nt->expires; + switch (CPUCLOCK_WHICH(timer->it_clock)) { default: BUG(); case CPUCLOCK_PROF: - if (cputime_eq(p->cputime_expires.prof_exp, - cputime_zero) || - cputime_gt(p->cputime_expires.prof_exp, - nt->expires.cpu)) - p->cputime_expires.prof_exp = - nt->expires.cpu; + if (expires_gt(p->cputime_expires.prof_exp, + exp->cpu)) + p->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_VIRT: - if (cputime_eq(p->cputime_expires.virt_exp, - cputime_zero) || - cputime_gt(p->cputime_expires.virt_exp, - nt->expires.cpu)) - p->cputime_expires.virt_exp = - nt->expires.cpu; + if (expires_gt(p->cputime_expires.virt_exp, + exp->cpu)) + p->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_SCHED: if (p->cputime_expires.sched_exp == 0 || - p->cputime_expires.sched_exp > - nt->expires.sched) + p->cputime_expires.sched_exp > exp->sched) p->cputime_expires.sched_exp = - nt->expires.sched; + exp->sched; break; } } else { + struct signal_struct *const sig = p->signal; + union cpu_time_count *exp = &timer->it.cpu.expires; + /* * For a process timer, set the cached expiration time. */ @@ -621,30 +631,23 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_VIRT: - if (!cputime_eq(p->signal->it_virt_expires, - cputime_zero) && - cputime_lt(p->signal->it_virt_expires, - timer->it.cpu.expires.cpu)) + if (expires_le(sig->it[CPUCLOCK_VIRT].expires, + exp->cpu)) break; - p->signal->cputime_expires.virt_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_PROF: - if (!cputime_eq(p->signal->it_prof_expires, - cputime_zero) && - cputime_lt(p->signal->it_prof_expires, - timer->it.cpu.expires.cpu)) + if (expires_le(sig->it[CPUCLOCK_PROF].expires, + exp->cpu)) break; - i = p->signal->rlim[RLIMIT_CPU].rlim_cur; + i = sig->rlim[RLIMIT_CPU].rlim_cur; if (i != RLIM_INFINITY && - i <= cputime_to_secs(timer->it.cpu.expires.cpu)) + i <= cputime_to_secs(exp->cpu)) break; - p->signal->cputime_expires.prof_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_SCHED: - p->signal->cputime_expires.sched_exp = - timer->it.cpu.expires.sched; + sig->cputime_expires.sched_exp = exp->sched; break; } } @@ -1071,6 +1074,40 @@ static void stop_process_timers(struct task_struct *tsk) spin_unlock_irqrestore(&cputimer->lock, flags); } +static u32 onecputick; + +static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, + cputime_t *expires, cputime_t cur_time, int signo) +{ + if (cputime_eq(it->expires, cputime_zero)) + return; + + if (cputime_ge(cur_time, it->expires)) { + if (!cputime_eq(it->incr, cputime_zero)) { + it->expires = cputime_add(it->expires, it->incr); + it->error += it->incr_error; + if (it->error >= onecputick) { + it->expires = cputime_sub(it->expires, + cputime_one_jiffy); + it->error -= onecputick; + } + } else { + it->expires = cputime_zero; + } + + trace_itimer_expire(signo == SIGPROF ? + ITIMER_PROF : ITIMER_VIRTUAL, + tsk->signal->leader_pid, cur_time); + __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + } + + if (!cputime_eq(it->expires, cputime_zero) && + (cputime_eq(*expires, cputime_zero) || + cputime_lt(it->expires, *expires))) { + *expires = it->expires; + } +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1090,10 +1127,10 @@ static void check_process_timers(struct task_struct *tsk, * Don't sample the current process CPU clocks if there are no timers. */ if (list_empty(&timers[CPUCLOCK_PROF]) && - cputime_eq(sig->it_prof_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) && sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && list_empty(&timers[CPUCLOCK_VIRT]) && - cputime_eq(sig->it_virt_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && list_empty(&timers[CPUCLOCK_SCHED])) { stop_process_timers(tsk); return; @@ -1153,38 +1190,11 @@ static void check_process_timers(struct task_struct *tsk, /* * Check for the special case process timers. */ - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - if (cputime_ge(ptime, sig->it_prof_expires)) { - /* ITIMER_PROF fires and reloads. */ - sig->it_prof_expires = sig->it_prof_incr; - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - sig->it_prof_expires = cputime_add( - sig->it_prof_expires, ptime); - } - __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_prof_expires, cputime_zero) && - (cputime_eq(prof_expires, cputime_zero) || - cputime_lt(sig->it_prof_expires, prof_expires))) { - prof_expires = sig->it_prof_expires; - } - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - if (cputime_ge(utime, sig->it_virt_expires)) { - /* ITIMER_VIRTUAL fires and reloads. */ - sig->it_virt_expires = sig->it_virt_incr; - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - sig->it_virt_expires = cputime_add( - sig->it_virt_expires, utime); - } - __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero) && - (cputime_eq(virt_expires, cputime_zero) || - cputime_lt(sig->it_virt_expires, virt_expires))) { - virt_expires = sig->it_virt_expires; - } - } + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, + SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, + SIGVTALRM); + if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); cputime_t x; @@ -1457,7 +1467,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (!cputime_eq(*oldval, cputime_zero)) { if (cputime_le(*oldval, now.cpu)) { /* Just about to fire. */ - *oldval = jiffies_to_cputime(1); + *oldval = cputime_one_jiffy; } else { *oldval = cputime_sub(*oldval, now.cpu); } @@ -1703,10 +1713,15 @@ static __init int init_posix_cpu_timers(void) .nsleep = thread_cpu_nsleep, .nsleep_restart = thread_cpu_nsleep_restart, }; + struct timespec ts; register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + cputime_to_timespec(cputime_one_jiffy, &ts); + onecputick = ts.tv_nsec; + WARN_ON(ts.tv_sec != 0); + return 0; } __initcall(init_posix_cpu_timers); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d089d05..4954407 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) return 0; } + +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +{ + *tp = current_kernel_time(); + return 0; +} + +static int posix_get_monotonic_coarse(clockid_t which_clock, + struct timespec *tp) +{ + *tp = get_monotonic_coarse(); + return 0; +} + +int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +{ + *tp = ktime_to_timespec(KTIME_LOW_RES); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -262,10 +281,26 @@ static __init int init_posix_timers(void) .timer_create = no_timer_create, .nsleep = no_nsleep, }; + struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; + struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 72067cb..91e09d3 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -208,3 +208,17 @@ config APM_EMULATION random kernel OOPSes or reboots that don't seem to be related to anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). + +config PM_RUNTIME + bool "Run-time PM core functionality" + depends on PM + ---help--- + Enable functionality allowing I/O devices to be put into energy-saving + (low power) states at run time (or autosuspended) after a specified + period of inactivity and woken up in response to a hardware-generated + wake-up event or a driver's request. + + Hardware support is generally required for this functionality to work + and the bus type drivers of the buses the devices are on are + responsible for the actual handling of the autosuspend requests and + wake-up events. diff --git a/kernel/power/console.c b/kernel/power/console.c index a3961b20..5187136 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -14,56 +14,13 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; -static int disable_vt_switch; - -/* - * Normally during a suspend, we allocate a new console and switch to it. - * When we resume, we switch back to the original console. This switch - * can be slow, so on systems where the framebuffer can handle restoration - * of video registers anyways, there's little point in doing the console - * switch. This function allows you to disable it by passing it '0'. - */ -void pm_set_vt_switch(int do_switch) -{ - acquire_console_sem(); - disable_vt_switch = !do_switch; - release_console_sem(); -} -EXPORT_SYMBOL(pm_set_vt_switch); int pm_prepare_console(void) { - acquire_console_sem(); - - if (disable_vt_switch) { - release_console_sem(); - return 0; - } - - orig_fgconsole = fg_console; - - if (vc_allocate(SUSPEND_CONSOLE)) { - /* we can't have a free VC for now. Too bad, - * we don't want to mess the screen for now. */ - release_console_sem(); + orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); + if (orig_fgconsole < 0) return 1; - } - if (set_console(SUSPEND_CONSOLE)) { - /* - * We're unable to switch to the SUSPEND_CONSOLE. - * Let the calling function know so it can decide - * what to do. - */ - release_console_sem(); - return 1; - } - release_console_sem(); - - if (vt_waitactive(SUSPEND_CONSOLE)) { - pr_debug("Suspend: Can't switch VCs."); - return 1; - } orig_kmsg = kmsg_redirect; kmsg_redirect = SUSPEND_CONSOLE; return 0; @@ -71,19 +28,9 @@ int pm_prepare_console(void) void pm_restore_console(void) { - acquire_console_sem(); - if (disable_vt_switch) { - release_console_sem(); - return; - } - set_console(orig_fgconsole); - release_console_sem(); - - if (vt_waitactive(orig_fgconsole)) { - pr_debug("Resume: Can't switch VCs."); - return; + if (orig_fgconsole >= 0) { + vt_move_to_console(orig_fgconsole, 0); + kmsg_redirect = orig_kmsg; } - - kmsg_redirect = orig_kmsg; } #endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 81d2e74..04b3a83 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -298,8 +298,8 @@ int hibernation_snapshot(int platform_mode) if (error) return error; - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); + /* Preallocate image memory before shutting down devices. */ + error = hibernate_preallocate_memory(); if (error) goto Close; @@ -315,6 +315,10 @@ int hibernation_snapshot(int platform_mode) /* Control returns here after successful restore */ Resume_devices: + /* We may need to release the preallocated image pages here. */ + if (error || !in_suspend) + swsusp_free(); + dpm_resume_end(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); resume_console(); @@ -460,11 +464,11 @@ int hibernation_platform_enter(void) error = hibernation_ops->prepare(); if (error) - goto Platofrm_finish; + goto Platform_finish; error = disable_nonboot_cpus(); if (error) - goto Platofrm_finish; + goto Platform_finish; local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); @@ -476,7 +480,7 @@ int hibernation_platform_enter(void) * We don't need to reenable the nonboot CPUs or resume consoles, since * the system is going to be halted anyway. */ - Platofrm_finish: + Platform_finish: hibernation_ops->finish(); dpm_suspend_noirq(PMSG_RESTORE); @@ -578,7 +582,10 @@ int hibernate(void) goto Thaw; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); - if (in_suspend && !error) { + if (error) + goto Thaw; + + if (in_suspend) { unsigned int flags = 0; if (hibernation_mode == HIBERNATION_PLATFORM) @@ -590,8 +597,8 @@ int hibernate(void) power_down(); } else { pr_debug("PM: Image restored successfully.\n"); - swsusp_free(); } + Thaw: thaw_processes(); Finish: diff --git a/kernel/power/main.c b/kernel/power/main.c index f710e36..347d2cc 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -11,6 +11,7 @@ #include <linux/kobject.h> #include <linux/string.h> #include <linux/resume-trace.h> +#include <linux/workqueue.h> #include "power.h" @@ -217,8 +218,24 @@ static struct attribute_group attr_group = { .attrs = g, }; +#ifdef CONFIG_PM_RUNTIME +struct workqueue_struct *pm_wq; + +static int __init pm_start_workqueue(void) +{ + pm_wq = create_freezeable_workqueue("pm"); + + return pm_wq ? 0 : -ENOMEM; +} +#else +static inline int pm_start_workqueue(void) { return 0; } +#endif + static int __init pm_init(void) { + int error = pm_start_workqueue(); + if (error) + return error; power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; diff --git a/kernel/power/power.h b/kernel/power/power.h index 26d5a26..46c5a26 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void); extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); -extern int swsusp_shrink_memory(void); +extern int hibernate_preallocate_memory(void); /** * Auxiliary structure used for reading the snapshot image data and diff --git a/kernel/power/process.c b/kernel/power/process.c index da2072d..cc2e553 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -9,6 +9,7 @@ #undef DEBUG #include <linux/interrupt.h> +#include <linux/oom.h> #include <linux/suspend.h> #include <linux/module.h> #include <linux/syscalls.h> diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 523a451..36cb168 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -233,7 +233,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) #define BM_END_OF_MAP (~0UL) -#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) +#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) struct bm_block { struct list_head hook; /* hook into a list of bitmap blocks */ @@ -275,7 +275,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); /** * create_bm_block_list - create a list of block bitmap objects - * @nr_blocks - number of blocks to allocate + * @pages - number of pages to track * @list - list to put the allocated blocks into * @ca - chain allocator to be used for allocating memory */ @@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, BUG_ON(!region); } else /* This allocation cannot fail */ - region = alloc_bootmem_low(sizeof(struct nosave_region)); + region = alloc_bootmem(sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); @@ -853,7 +853,7 @@ static unsigned int count_highmem_pages(void) struct zone *zone; unsigned int n = 0; - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long pfn, max_zone_pfn; if (!is_highmem(zone)) @@ -916,7 +916,7 @@ static unsigned int count_data_pages(void) unsigned long pfn, max_zone_pfn; unsigned int n = 0; - for_each_zone(zone) { + for_each_populated_zone(zone) { if (is_highmem(zone)) continue; @@ -1010,7 +1010,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) struct zone *zone; unsigned long pfn; - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long max_zone_pfn; mark_free_pages(zone); @@ -1033,6 +1033,25 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) static unsigned int nr_copy_pages; /* Number of pages needed for saving the original pfns of the image pages */ static unsigned int nr_meta_pages; +/* + * Numbers of normal and highmem page frames allocated for hibernation image + * before suspending devices. + */ +unsigned int alloc_normal, alloc_highmem; +/* + * Memory bitmap used for marking saveable pages (during hibernation) or + * hibernation image pages (during restore) + */ +static struct memory_bitmap orig_bm; +/* + * Memory bitmap used during hibernation for marking allocated page frames that + * will contain copies of saveable pages. During restore it is initially used + * for marking hibernation image pages, but then the set bits from it are + * duplicated in @orig_bm and it is released. On highmem systems it is next + * used for marking "safe" highmem pages, but it has to be reinitialized for + * this purpose. + */ +static struct memory_bitmap copy_bm; /** * swsusp_free - free pages allocated for the suspend. @@ -1046,7 +1065,7 @@ void swsusp_free(void) struct zone *zone; unsigned long pfn, max_zone_pfn; - for_each_zone(zone) { + for_each_populated_zone(zone) { max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { @@ -1064,74 +1083,286 @@ void swsusp_free(void) nr_meta_pages = 0; restore_pblist = NULL; buffer = NULL; + alloc_normal = 0; + alloc_highmem = 0; } +/* Helper functions used for the shrinking of memory. */ + +#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) + /** - * swsusp_shrink_memory - Try to free as much memory as needed - * - * ... but do not OOM-kill anyone + * preallocate_image_pages - Allocate a number of pages for hibernation image + * @nr_pages: Number of page frames to allocate. + * @mask: GFP flags to use for the allocation. * - * Notice: all userland should be stopped before it is called, or - * livelock is possible. + * Return value: Number of page frames actually allocated + */ +static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) +{ + unsigned long nr_alloc = 0; + + while (nr_pages > 0) { + struct page *page; + + page = alloc_image_page(mask); + if (!page) + break; + memory_bm_set_bit(©_bm, page_to_pfn(page)); + if (PageHighMem(page)) + alloc_highmem++; + else + alloc_normal++; + nr_pages--; + nr_alloc++; + } + + return nr_alloc; +} + +static unsigned long preallocate_image_memory(unsigned long nr_pages) +{ + return preallocate_image_pages(nr_pages, GFP_IMAGE); +} + +#ifdef CONFIG_HIGHMEM +static unsigned long preallocate_image_highmem(unsigned long nr_pages) +{ + return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM); +} + +/** + * __fraction - Compute (an approximation of) x * (multiplier / base) */ +static unsigned long __fraction(u64 x, u64 multiplier, u64 base) +{ + x *= multiplier; + do_div(x, base); + return (unsigned long)x; +} + +static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + unsigned long alloc = __fraction(nr_pages, highmem, total); -#define SHRINK_BITE 10000 -static inline unsigned long __shrink_memory(long tmp) + return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM); +} +#else /* CONFIG_HIGHMEM */ +static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) { - if (tmp > SHRINK_BITE) - tmp = SHRINK_BITE; - return shrink_all_memory(tmp); + return 0; } -int swsusp_shrink_memory(void) +static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + return 0; +} +#endif /* CONFIG_HIGHMEM */ + +/** + * free_unnecessary_pages - Release preallocated pages not needed for the image + */ +static void free_unnecessary_pages(void) +{ + unsigned long save_highmem, to_free_normal, to_free_highmem; + + to_free_normal = alloc_normal - count_data_pages(); + save_highmem = count_highmem_pages(); + if (alloc_highmem > save_highmem) { + to_free_highmem = alloc_highmem - save_highmem; + } else { + to_free_highmem = 0; + to_free_normal -= save_highmem - alloc_highmem; + } + + memory_bm_position_reset(©_bm); + + while (to_free_normal > 0 && to_free_highmem > 0) { + unsigned long pfn = memory_bm_next_pfn(©_bm); + struct page *page = pfn_to_page(pfn); + + if (PageHighMem(page)) { + if (!to_free_highmem) + continue; + to_free_highmem--; + alloc_highmem--; + } else { + if (!to_free_normal) + continue; + to_free_normal--; + alloc_normal--; + } + memory_bm_clear_bit(©_bm, pfn); + swsusp_unset_page_forbidden(page); + swsusp_unset_page_free(page); + __free_page(page); + } +} + +/** + * minimum_image_size - Estimate the minimum acceptable size of an image + * @saveable: Number of saveable pages in the system. + * + * We want to avoid attempting to free too much memory too hard, so estimate the + * minimum acceptable size of a hibernation image to use as the lower limit for + * preallocating memory. + * + * We assume that the minimum image size should be proportional to + * + * [number of saveable pages] - [number of pages that can be freed in theory] + * + * where the second term is the sum of (1) reclaimable slab pages, (2) active + * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages, + * minus mapped file pages. + */ +static unsigned long minimum_image_size(unsigned long saveable) +{ + unsigned long size; + + size = global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_INACTIVE_ANON) + + global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_FILE) + - global_page_state(NR_FILE_MAPPED); + + return saveable <= size ? 0 : saveable - size; +} + +/** + * hibernate_preallocate_memory - Preallocate memory for hibernation image + * + * To create a hibernation image it is necessary to make a copy of every page + * frame in use. We also need a number of page frames to be free during + * hibernation for allocations made while saving the image and for device + * drivers, in case they need to allocate memory from their hibernation + * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, + * respectively, both of which are rough estimates). To make this happen, we + * compute the total number of available page frames and allocate at least + * + * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES + * + * of them, which corresponds to the maximum size of a hibernation image. + * + * If image_size is set below the number following from the above formula, + * the preallocation of memory is continued until the total number of saveable + * pages in the system is below the requested image size or the minimum + * acceptable image size returned by minimum_image_size(), whichever is greater. + */ +int hibernate_preallocate_memory(void) { - long tmp; struct zone *zone; - unsigned long pages = 0; - unsigned int i = 0; - char *p = "-\\|/"; + unsigned long saveable, size, max_size, count, highmem, pages = 0; + unsigned long alloc, save_highmem, pages_highmem; struct timeval start, stop; + int error; - printk(KERN_INFO "PM: Shrinking memory... "); + printk(KERN_INFO "PM: Preallocating image memory... "); do_gettimeofday(&start); - do { - long size, highmem_size; - - highmem_size = count_highmem_pages(); - size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; - tmp = size; - size += highmem_size; - for_each_populated_zone(zone) { - tmp += snapshot_additional_pages(zone); - if (is_highmem(zone)) { - highmem_size -= - zone_page_state(zone, NR_FREE_PAGES); - } else { - tmp -= zone_page_state(zone, NR_FREE_PAGES); - tmp += zone->lowmem_reserve[ZONE_NORMAL]; - } - } - if (highmem_size < 0) - highmem_size = 0; + error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); + if (error) + goto err_out; - tmp += highmem_size; - if (tmp > 0) { - tmp = __shrink_memory(tmp); - if (!tmp) - return -ENOMEM; - pages += tmp; - } else if (size > image_size / PAGE_SIZE) { - tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); - pages += tmp; - } - printk("\b%c", p[i++%4]); - } while (tmp > 0); + error = memory_bm_create(©_bm, GFP_IMAGE, PG_ANY); + if (error) + goto err_out; + + alloc_normal = 0; + alloc_highmem = 0; + + /* Count the number of saveable data pages. */ + save_highmem = count_highmem_pages(); + saveable = count_data_pages(); + + /* + * Compute the total number of page frames we can use (count) and the + * number of pages needed for image metadata (size). + */ + count = saveable; + saveable += save_highmem; + highmem = save_highmem; + size = 0; + for_each_populated_zone(zone) { + size += snapshot_additional_pages(zone); + if (is_highmem(zone)) + highmem += zone_page_state(zone, NR_FREE_PAGES); + else + count += zone_page_state(zone, NR_FREE_PAGES); + } + count += highmem; + count -= totalreserve_pages; + + /* Compute the maximum number of saveable pages to leave in memory. */ + max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; + size = DIV_ROUND_UP(image_size, PAGE_SIZE); + if (size > max_size) + size = max_size; + /* + * If the maximum is not less than the current number of saveable pages + * in memory, allocate page frames for the image and we're done. + */ + if (size >= saveable) { + pages = preallocate_image_highmem(save_highmem); + pages += preallocate_image_memory(saveable - pages); + goto out; + } + + /* Estimate the minimum size of the image. */ + pages = minimum_image_size(saveable); + if (size < pages) + size = min_t(unsigned long, pages, max_size); + + /* + * Let the memory management subsystem know that we're going to need a + * large number of page frames to allocate and make it free some memory. + * NOTE: If this is not done, performance will be hurt badly in some + * test cases. + */ + shrink_all_memory(saveable - size); + + /* + * The number of saveable pages in memory was too high, so apply some + * pressure to decrease it. First, make room for the largest possible + * image and fail if that doesn't work. Next, try to decrease the size + * of the image as much as indicated by 'size' using allocations from + * highmem and non-highmem zones separately. + */ + pages_highmem = preallocate_image_highmem(highmem / 2); + alloc = (count - max_size) - pages_highmem; + pages = preallocate_image_memory(alloc); + if (pages < alloc) + goto err_out; + size = max_size - size; + alloc = size; + size = preallocate_highmem_fraction(size, highmem, count); + pages_highmem += size; + alloc -= size; + pages += preallocate_image_memory(alloc); + pages += pages_highmem; + + /* + * We only need as many page frames for the image as there are saveable + * pages in memory, but we have allocated more. Release the excessive + * ones now. + */ + free_unnecessary_pages(); + + out: do_gettimeofday(&stop); - printk("\bdone (%lu pages freed)\n", pages); - swsusp_show_speed(&start, &stop, pages, "Freed"); + printk(KERN_CONT "done (allocated %lu pages)\n", pages); + swsusp_show_speed(&start, &stop, pages, "Allocated"); return 0; + + err_out: + printk(KERN_CONT "\n"); + swsusp_free(); + return -ENOMEM; } #ifdef CONFIG_HIGHMEM @@ -1142,7 +1373,7 @@ int swsusp_shrink_memory(void) static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { - unsigned int free_highmem = count_free_highmem_pages(); + unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; if (free_highmem >= nr_highmem) nr_highmem = 0; @@ -1164,19 +1395,17 @@ count_pages_for_highmem(unsigned int nr_highmem) { return 0; } static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) { struct zone *zone; - unsigned int free = 0, meta = 0; + unsigned int free = alloc_normal; - for_each_zone(zone) { - meta += snapshot_additional_pages(zone); + for_each_populated_zone(zone) if (!is_highmem(zone)) free += zone_page_state(zone, NR_FREE_PAGES); - } nr_pages += count_pages_for_highmem(nr_highmem); - pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n", - nr_pages, PAGES_FOR_IO, meta, free); + pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n", + nr_pages, PAGES_FOR_IO, free); - return free > nr_pages + PAGES_FOR_IO + meta; + return free > nr_pages + PAGES_FOR_IO; } #ifdef CONFIG_HIGHMEM @@ -1198,7 +1427,7 @@ static inline int get_highmem_buffer(int safe_needed) */ static inline unsigned int -alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) +alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) { unsigned int to_alloc = count_free_highmem_pages(); @@ -1218,7 +1447,7 @@ alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) static inline int get_highmem_buffer(int safe_needed) { return 0; } static inline unsigned int -alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } +alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } #endif /* CONFIG_HIGHMEM */ /** @@ -1237,51 +1466,36 @@ static int swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, unsigned int nr_pages, unsigned int nr_highmem) { - int error; - - error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); - if (error) - goto Free; - - error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); - if (error) - goto Free; + int error = 0; if (nr_highmem > 0) { error = get_highmem_buffer(PG_ANY); if (error) - goto Free; - - nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); + goto err_out; + if (nr_highmem > alloc_highmem) { + nr_highmem -= alloc_highmem; + nr_pages += alloc_highmem_pages(copy_bm, nr_highmem); + } } - while (nr_pages-- > 0) { - struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); - - if (!page) - goto Free; + if (nr_pages > alloc_normal) { + nr_pages -= alloc_normal; + while (nr_pages-- > 0) { + struct page *page; - memory_bm_set_bit(copy_bm, page_to_pfn(page)); + page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + if (!page) + goto err_out; + memory_bm_set_bit(copy_bm, page_to_pfn(page)); + } } + return 0; - Free: + err_out: swsusp_free(); - return -ENOMEM; + return error; } -/* Memory bitmap used for marking saveable pages (during suspend) or the - * suspend image pages (during resume) - */ -static struct memory_bitmap orig_bm; -/* Memory bitmap used on suspend for marking allocated pages that will contain - * the copies of saveable pages. During resume it is initially used for - * marking the suspend image pages, but then its set bits are duplicated in - * @orig_bm and it is released. Next, on systems with high memory, it may be - * used for marking "safe" highmem pages, but it has to be reinitialized for - * this purpose. - */ -static struct memory_bitmap copy_bm; - asmlinkage int swsusp_save(void) { unsigned int nr_pages, nr_highmem; @@ -1474,7 +1688,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) unsigned long pfn, max_zone_pfn; /* Clear page flags */ - for_each_zone(zone) { + for_each_populated_zone(zone) { max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) diff --git a/kernel/printk.c b/kernel/printk.c index b4d97b5..f38b07f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -37,6 +37,12 @@ #include <asm/uaccess.h> /* + * for_each_console() allows you to iterate on each console + */ +#define for_each_console(con) \ + for (con = console_drivers; con != NULL; con = con->next) + +/* * Architectures can override it: */ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) @@ -61,6 +67,8 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; +static int saved_console_loglevel = -1; + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -198,12 +206,11 @@ __setup("log_buf_len=", log_buf_len_setup); #ifdef CONFIG_BOOT_PRINTK_DELAY static unsigned int boot_delay; /* msecs delay after each printk during bootup */ -static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */ +static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) { unsigned long lpj; - unsigned long long loops_per_msec; lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ loops_per_msec = (unsigned long long)lpj / 1000 * HZ; @@ -212,10 +219,9 @@ static int __init boot_delay_setup(char *str) if (boot_delay > 10 * 1000) boot_delay = 0; - printk_delay_msec = loops_per_msec; - printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, " - "HZ: %d, printk_delay_msec: %llu\n", - boot_delay, preset_lpj, lpj, HZ, printk_delay_msec); + pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " + "HZ: %d, loops_per_msec: %llu\n", + boot_delay, preset_lpj, lpj, HZ, loops_per_msec); return 1; } __setup("boot_delay=", boot_delay_setup); @@ -228,7 +234,7 @@ static void boot_delay_msec(void) if (boot_delay == 0 || system_state != SYSTEM_BOOTING) return; - k = (unsigned long long)printk_delay_msec * boot_delay; + k = (unsigned long long)loops_per_msec * boot_delay; timeout = jiffies + msecs_to_jiffies(boot_delay); while (k) { @@ -372,10 +378,15 @@ int do_syslog(int type, char __user *buf, int len) logged_chars = 0; break; case 6: /* Disable logging to console */ + if (saved_console_loglevel == -1) + saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; case 7: /* Enable logging to console */ - console_loglevel = default_console_loglevel; + if (saved_console_loglevel != -1) { + console_loglevel = saved_console_loglevel; + saved_console_loglevel = -1; + } break; case 8: /* Set level of messages printed to console */ error = -EINVAL; @@ -384,6 +395,8 @@ int do_syslog(int type, char __user *buf, int len) if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; + /* Implicitly re-enable logging to console */ + saved_console_loglevel = -1; error = 0; break; case 9: /* Number of chars in the log buffer */ @@ -412,7 +425,7 @@ static void __call_console_drivers(unsigned start, unsigned end) { struct console *con; - for (con = console_drivers; con; con = con->next) { + for_each_console(con) { if ((con->flags & CON_ENABLED) && con->write && (cpu_online(smp_processor_id()) || (con->flags & CON_ANYTIME))) @@ -544,7 +557,7 @@ static int have_callable_console(void) { struct console *con; - for (con = console_drivers; con; con = con->next) + for_each_console(con) if (con->flags & CON_ANYTIME) return 1; @@ -640,6 +653,20 @@ static int recursion_bug; static int new_text_line = 1; static char printk_buf[1024]; +int printk_delay_msec __read_mostly; + +static inline void printk_delay(void) +{ + if (unlikely(printk_delay_msec)) { + int m = printk_delay_msec; + + while (m--) { + mdelay(1); + touch_nmi_watchdog(); + } + } +} + asmlinkage int vprintk(const char *fmt, va_list args) { int printed_len = 0; @@ -649,6 +676,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) char *p; boot_delay_msec(); + printk_delay(); preempt_disable(); /* This stops the holder of console_sem just where we want him */ @@ -1060,12 +1088,6 @@ void __sched console_conditional_schedule(void) } EXPORT_SYMBOL(console_conditional_schedule); -void console_print(const char *s) -{ - printk(KERN_EMERG "%s", s); -} -EXPORT_SYMBOL(console_print); - void console_unblank(void) { struct console *c; @@ -1082,7 +1104,7 @@ void console_unblank(void) console_locked = 1; console_may_schedule = 0; - for (c = console_drivers; c != NULL; c = c->next) + for_each_console(c) if ((c->flags & CON_ENABLED) && c->unblank) c->unblank(); release_console_sem(); @@ -1097,7 +1119,7 @@ struct tty_driver *console_device(int *index) struct tty_driver *driver = NULL; acquire_console_sem(); - for (c = console_drivers; c != NULL; c = c->next) { + for_each_console(c) { if (!c->device) continue; driver = c->device(c, index); @@ -1134,25 +1156,49 @@ EXPORT_SYMBOL(console_start); * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the * console driver was initialized. + * + * This can happen pretty early during the boot process (because of + * early_printk) - sometimes before setup_arch() completes - be careful + * of what kernel features are used - they may not be initialised yet. + * + * There are two types of consoles - bootconsoles (early_printk) and + * "real" consoles (everything which is not a bootconsole) which are + * handled differently. + * - Any number of bootconsoles can be registered at any time. + * - As soon as a "real" console is registered, all bootconsoles + * will be unregistered automatically. + * - Once a "real" console is registered, any attempt to register a + * bootconsoles will be rejected */ -void register_console(struct console *console) +void register_console(struct console *newcon) { int i; unsigned long flags; - struct console *bootconsole = NULL; + struct console *bcon = NULL; - if (console_drivers) { - if (console->flags & CON_BOOT) - return; - if (console_drivers->flags & CON_BOOT) - bootconsole = console_drivers; + /* + * before we register a new CON_BOOT console, make sure we don't + * already have a valid console + */ + if (console_drivers && newcon->flags & CON_BOOT) { + /* find the last or real console */ + for_each_console(bcon) { + if (!(bcon->flags & CON_BOOT)) { + printk(KERN_INFO "Too late to register bootconsole %s%d\n", + newcon->name, newcon->index); + return; + } + } } - if (preferred_console < 0 || bootconsole || !console_drivers) + if (console_drivers && console_drivers->flags & CON_BOOT) + bcon = console_drivers; + + if (preferred_console < 0 || bcon || !console_drivers) preferred_console = selected_console; - if (console->early_setup) - console->early_setup(); + if (newcon->early_setup) + newcon->early_setup(); /* * See if we want to use this console driver. If we @@ -1160,13 +1206,13 @@ void register_console(struct console *console) * that registers here. */ if (preferred_console < 0) { - if (console->index < 0) - console->index = 0; - if (console->setup == NULL || - console->setup(console, NULL) == 0) { - console->flags |= CON_ENABLED; - if (console->device) { - console->flags |= CON_CONSDEV; + if (newcon->index < 0) + newcon->index = 0; + if (newcon->setup == NULL || + newcon->setup(newcon, NULL) == 0) { + newcon->flags |= CON_ENABLED; + if (newcon->device) { + newcon->flags |= CON_CONSDEV; preferred_console = 0; } } @@ -1178,64 +1224,62 @@ void register_console(struct console *console) */ for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { - if (strcmp(console_cmdline[i].name, console->name) != 0) + if (strcmp(console_cmdline[i].name, newcon->name) != 0) continue; - if (console->index >= 0 && - console->index != console_cmdline[i].index) + if (newcon->index >= 0 && + newcon->index != console_cmdline[i].index) continue; - if (console->index < 0) - console->index = console_cmdline[i].index; + if (newcon->index < 0) + newcon->index = console_cmdline[i].index; #ifdef CONFIG_A11Y_BRAILLE_CONSOLE if (console_cmdline[i].brl_options) { - console->flags |= CON_BRL; - braille_register_console(console, + newcon->flags |= CON_BRL; + braille_register_console(newcon, console_cmdline[i].index, console_cmdline[i].options, console_cmdline[i].brl_options); return; } #endif - if (console->setup && - console->setup(console, console_cmdline[i].options) != 0) + if (newcon->setup && + newcon->setup(newcon, console_cmdline[i].options) != 0) break; - console->flags |= CON_ENABLED; - console->index = console_cmdline[i].index; + newcon->flags |= CON_ENABLED; + newcon->index = console_cmdline[i].index; if (i == selected_console) { - console->flags |= CON_CONSDEV; + newcon->flags |= CON_CONSDEV; preferred_console = selected_console; } break; } - if (!(console->flags & CON_ENABLED)) + if (!(newcon->flags & CON_ENABLED)) return; - if (bootconsole && (console->flags & CON_CONSDEV)) { - printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", - bootconsole->name, bootconsole->index, - console->name, console->index); - unregister_console(bootconsole); - console->flags &= ~CON_PRINTBUFFER; - } else { - printk(KERN_INFO "console [%s%d] enabled\n", - console->name, console->index); - } + /* + * If we have a bootconsole, and are switching to a real console, + * don't print everything out again, since when the boot console, and + * the real console are the same physical device, it's annoying to + * see the beginning boot messages twice + */ + if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) + newcon->flags &= ~CON_PRINTBUFFER; /* * Put this console in the list - keep the * preferred driver at the head of the list. */ acquire_console_sem(); - if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { - console->next = console_drivers; - console_drivers = console; - if (console->next) - console->next->flags &= ~CON_CONSDEV; + if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { + newcon->next = console_drivers; + console_drivers = newcon; + if (newcon->next) + newcon->next->flags &= ~CON_CONSDEV; } else { - console->next = console_drivers->next; - console_drivers->next = console; + newcon->next = console_drivers->next; + console_drivers->next = newcon; } - if (console->flags & CON_PRINTBUFFER) { + if (newcon->flags & CON_PRINTBUFFER) { /* * release_console_sem() will print out the buffered messages * for us. @@ -1245,6 +1289,28 @@ void register_console(struct console *console) spin_unlock_irqrestore(&logbuf_lock, flags); } release_console_sem(); + + /* + * By unregistering the bootconsoles after we enable the real console + * we get the "console xxx enabled" message on all the consoles - + * boot consoles, real consoles, etc - this is to ensure that end + * users know there might be something in the kernel's log buffer that + * went to the bootconsole (that they do not see on the real console) + */ + if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + /* we need to iterate through twice, to make sure we print + * everything out, before we unregister the console(s) + */ + printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n", + newcon->name, newcon->index); + for_each_console(bcon) + if (bcon->flags & CON_BOOT) + unregister_console(bcon); + } else { + printk(KERN_INFO "%sconsole [%s%d] enabled\n", + (newcon->flags & CON_BOOT) ? "boot" : "" , + newcon->name, newcon->index); + } } EXPORT_SYMBOL(register_console); @@ -1287,11 +1353,13 @@ EXPORT_SYMBOL(unregister_console); static int __init disable_boot_consoles(void) { - if (console_drivers != NULL) { - if (console_drivers->flags & CON_BOOT) { + struct console *con; + + for_each_console(con) { + if (con->flags & CON_BOOT) { printk(KERN_INFO "turn off boot console %s%d\n", - console_drivers->name, console_drivers->index); - return unregister_console(console_drivers); + con->name, con->index); + unregister_console(con); } } return 0; diff --git a/kernel/profile.c b/kernel/profile.c index 419250e..a55d3a3 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -442,48 +442,51 @@ void profile_tick(int type) #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <asm/uaccess.h> -static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) { - int len = cpumask_scnprintf(page, count, data); - if (count - len < 2) - return -EINVAL; - len += sprintf(page + len, "\n"); - return len; + seq_cpumask(m, prof_cpu_mask); + seq_putc(m, '\n'); + return 0; } -static int prof_cpu_mask_write_proc(struct file *file, - const char __user *buffer, unsigned long count, void *data) +static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, prof_cpu_mask_proc_show, NULL); +} + +static ssize_t prof_cpu_mask_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) { - struct cpumask *mask = data; - unsigned long full_count = count, err; cpumask_var_t new_value; + int err; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(buffer, count, new_value); if (!err) { - cpumask_copy(mask, new_value); - err = full_count; + cpumask_copy(prof_cpu_mask, new_value); + err = count; } free_cpumask_var(new_value); return err; } +static const struct file_operations prof_cpu_mask_proc_fops = { + .open = prof_cpu_mask_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = prof_cpu_mask_proc_write, +}; + void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) { - struct proc_dir_entry *entry; - /* create /proc/irq/prof_cpu_mask */ - entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); - if (!entry) - return; - entry->data = prof_cpu_mask; - entry->read_proc = prof_cpu_mask_read_proc; - entry->write_proc = prof_cpu_mask_write_proc; + proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops); } /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 082c320..307c285 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -152,7 +152,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - return security_ptrace_may_access(task, mode); + return security_ptrace_access_check(task, mode); } bool ptrace_may_access(struct task_struct *task, unsigned int mode) diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c deleted file mode 100644 index 0f2b0b3..0000000 --- a/kernel/rcuclassic.c +++ /dev/null @@ -1,807 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2001 - * - * Authors: Dipankar Sarma <dipankar@in.ibm.com> - * Manfred Spraul <manfred@colorfullife.com> - * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. - * Papers: - * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf - * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/spinlock.h> -#include <linux/smp.h> -#include <linux/rcupdate.h> -#include <linux/interrupt.h> -#include <linux/sched.h> -#include <asm/atomic.h> -#include <linux/bitops.h> -#include <linux/module.h> -#include <linux/completion.h> -#include <linux/moduleparam.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/mutex.h> -#include <linux/time.h> - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); -EXPORT_SYMBOL_GPL(rcu_lock_map); -#endif - - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_ctrlblk = { - .cur = -300, - .completed = -300, - .pending = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), - .cpumask = CPU_BITS_NONE, -}; - -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .cur = -300, - .completed = -300, - .pending = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), - .cpumask = CPU_BITS_NONE, -}; - -static DEFINE_PER_CPU(struct rcu_data, rcu_data); -static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); - -/* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. - */ -void rcu_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - rdp->passed_quiesc = 1; -} - -void rcu_bh_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; -} - -static int blimit = 10; -static int qhimark = 10000; -static int qlowmark = 100; - -#ifdef CONFIG_SMP -static void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) -{ - int cpu; - unsigned long flags; - - set_need_resched(); - spin_lock_irqsave(&rcp->lock, flags); - if (unlikely(!rcp->signaled)) { - rcp->signaled = 1; - /* - * Don't send IPI to itself. With irqs disabled, - * rdp->cpu is the current cpu. - * - * cpu_online_mask is updated by the _cpu_down() - * using __stop_machine(). Since we're in irqs disabled - * section, __stop_machine() is not exectuting, hence - * the cpu_online_mask is stable. - * - * However, a cpu might have been offlined _just_ before - * we disabled irqs while entering here. - * And rcu subsystem might not yet have handled the CPU_DEAD - * notification, leading to the offlined cpu's bit - * being set in the rcp->cpumask. - * - * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent - * sending smp_reschedule() to an offlined CPU. - */ - for_each_cpu_and(cpu, - to_cpumask(rcp->cpumask), cpu_online_mask) { - if (cpu != rdp->cpu) - smp_send_reschedule(cpu); - } - } - spin_unlock_irqrestore(&rcp->lock, flags); -} -#else -static inline void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) -{ - set_need_resched(); -} -#endif - -static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - long batch; - - head->next = NULL; - smp_mb(); /* Read of rcu->cur must happen after any change by caller. */ - - /* - * Determine the batch number of this callback. - * - * Using ACCESS_ONCE to avoid the following error when gcc eliminates - * local variable "batch" and emits codes like this: - * 1) rdp->batch = rcp->cur + 1 # gets old value - * ...... - * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value - * then [*nxttail[0], *nxttail[1]) may contain callbacks - * that batch# = rdp->batch, see the comment of struct rcu_data. - */ - batch = ACCESS_ONCE(rcp->cur) + 1; - - if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) { - /* process callbacks */ - rdp->nxttail[0] = rdp->nxttail[1]; - rdp->nxttail[1] = rdp->nxttail[2]; - if (rcu_batch_after(batch - 1, rdp->batch)) - rdp->nxttail[0] = rdp->nxttail[2]; - } - - rdp->batch = batch; - *rdp->nxttail[2] = head; - rdp->nxttail[2] = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } -} - -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - -static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) -{ - rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; -} - -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) -{ - int cpu; - long delta; - unsigned long flags; - - /* Only let one CPU complain about others per time interval. */ - - spin_lock_irqsave(&rcp->lock, flags); - delta = jiffies - rcp->jiffies_stall; - if (delta < 2 || rcp->cur != rcp->completed) { - spin_unlock_irqrestore(&rcp->lock, flags); - return; - } - rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - spin_unlock_irqrestore(&rcp->lock, flags); - - /* OK, time to rat on our buddy... */ - - printk(KERN_ERR "INFO: RCU detected CPU stalls:"); - for_each_possible_cpu(cpu) { - if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask))) - printk(" %d", cpu); - } - printk(" (detected by %d, t=%ld jiffies)\n", - smp_processor_id(), (long)(jiffies - rcp->gp_start)); -} - -static void print_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long flags; - - printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", - smp_processor_id(), jiffies, - jiffies - rcp->gp_start); - dump_stack(); - spin_lock_irqsave(&rcp->lock, flags); - if ((long)(jiffies - rcp->jiffies_stall) >= 0) - rcp->jiffies_stall = - jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - spin_unlock_irqrestore(&rcp->lock, flags); - set_need_resched(); /* kick ourselves to get things going. */ -} - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - long delta; - - delta = jiffies - rcp->jiffies_stall; - if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) && - delta >= 0) { - - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(rcp); - - } else if (rcp->cur != rcp->completed && delta >= 2) { - - /* They had two seconds to dump stack, so complain. */ - print_other_cpu_stall(rcp); - } -} - -#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) -{ -} - -static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -/** - * call_rcu - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period - * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - - head->func = func; - local_irq_save(flags); - __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/** - * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period - * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_bh() assumes - * that the read-side critical sections end on completion of a softirq - * handler. This means that read-side critical sections in process - * context must not be interrupted by softirqs. This interface is to be - * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by rcu_read_lock() and - * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() - * and rcu_read_unlock_bh(), if in process context. These may be nested. - */ -void call_rcu_bh(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - - head->func = func; - local_irq_save(flags); - __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu_bh); - -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed(void) -{ - return rcu_ctrlblk.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed_bh(void) -{ - return rcu_bh_ctrlblk.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - -/* Raises the softirq for processing rcu_callbacks. */ -static inline void raise_rcu_softirq(void) -{ - raise_softirq(RCU_SOFTIRQ); -} - -/* - * Invoke the completed RCU callbacks. They are expected to be in - * a per-cpu list. - */ -static void rcu_do_batch(struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_head *next, *list; - int count = 0; - - list = rdp->donelist; - while (list) { - next = list->next; - prefetch(next); - list->func(list); - list = next; - if (++count >= rdp->blimit) - break; - } - rdp->donelist = list; - - local_irq_save(flags); - rdp->qlen -= count; - local_irq_restore(flags); - if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) - rdp->blimit = blimit; - - if (!rdp->donelist) - rdp->donetail = &rdp->donelist; - else - raise_rcu_softirq(); -} - -/* - * Grace period handling: - * The grace period handling consists out of two steps: - * - A new grace period is started. - * This is done by rcu_start_batch. The start is not broadcasted to - * all cpus, they must pick this up by comparing rcp->cur with - * rdp->quiescbatch. All cpus are recorded in the - * rcu_ctrlblk.cpumask bitmap. - * - All cpus must go through a quiescent state. - * Since the start of the grace period is not broadcasted, at least two - * calls to rcu_check_quiescent_state are required: - * The first call just notices that a new grace period is running. The - * following calls check if there was a quiescent state since the beginning - * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If - * the bitmap is empty, then the grace period is completed. - * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace - * period (if necessary). - */ - -/* - * Register a new batch of callbacks, and start it up if there is currently no - * active batch and the batch to be registered has not already occurred. - * Caller must hold rcu_ctrlblk.lock. - */ -static void rcu_start_batch(struct rcu_ctrlblk *rcp) -{ - if (rcp->cur != rcp->pending && - rcp->completed == rcp->cur) { - rcp->cur++; - record_gp_stall_check_time(rcp); - - /* - * Accessing nohz_cpu_mask before incrementing rcp->cur needs a - * Barrier Otherwise it can cause tickless idle CPUs to be - * included in rcp->cpumask, which will extend graceperiods - * unnecessarily. - */ - smp_mb(); - cpumask_andnot(to_cpumask(rcp->cpumask), - cpu_online_mask, nohz_cpu_mask); - - rcp->signaled = 0; - } -} - -/* - * cpu went through a quiescent state since the beginning of the grace period. - * Clear it from the cpu mask and complete the grace period if it was the last - * cpu. Start another grace period if someone has further entries pending - */ -static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) -{ - cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask)); - if (cpumask_empty(to_cpumask(rcp->cpumask))) { - /* batch completed ! */ - rcp->completed = rcp->cur; - rcu_start_batch(rcp); - } -} - -/* - * Check if the cpu has gone through a quiescent state (say context - * switch). If so and if it already hasn't done so in this RCU - * quiescent cycle, then indicate that it has done so. - */ -static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - unsigned long flags; - - if (rdp->quiescbatch != rcp->cur) { - /* start new grace period: */ - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; - rdp->quiescbatch = rcp->cur; - return; - } - - /* Grace period already completed for this cpu? - * qs_pending is checked instead of the actual bitmap to avoid - * cacheline trashing. - */ - if (!rdp->qs_pending) - return; - - /* - * Was there a quiescent state since the beginning of the grace - * period? If no, then exit and wait for the next call. - */ - if (!rdp->passed_quiesc) - return; - rdp->qs_pending = 0; - - spin_lock_irqsave(&rcp->lock, flags); - /* - * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync - * during cpu startup. Ignore the quiescent state. - */ - if (likely(rdp->quiescbatch == rcp->cur)) - cpu_quiet(rdp->cpu, rcp); - - spin_unlock_irqrestore(&rcp->lock, flags); -} - - -#ifdef CONFIG_HOTPLUG_CPU - -/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing - * locking requirements, the list it's pulling from has to belong to a cpu - * which is dead and hence not processing interrupts. - */ -static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail, long batch) -{ - unsigned long flags; - - if (list) { - local_irq_save(flags); - this_rdp->batch = batch; - *this_rdp->nxttail[2] = list; - this_rdp->nxttail[2] = tail; - local_irq_restore(flags); - } -} - -static void __rcu_offline_cpu(struct rcu_data *this_rdp, - struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - unsigned long flags; - - /* - * if the cpu going offline owns the grace period - * we can block indefinitely waiting for it, so flush - * it here - */ - spin_lock_irqsave(&rcp->lock, flags); - if (rcp->cur != rcp->completed) - cpu_quiet(rdp->cpu, rcp); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); - spin_unlock(&rcp->lock); - - this_rdp->qlen += rdp->qlen; - local_irq_restore(flags); -} - -static void rcu_offline_cpu(int cpu) -{ - struct rcu_data *this_rdp = &get_cpu_var(rcu_data); - struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); - - __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, - &per_cpu(rcu_data, cpu)); - __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, - &per_cpu(rcu_bh_data, cpu)); - put_cpu_var(rcu_data); - put_cpu_var(rcu_bh_data); -} - -#else - -static void rcu_offline_cpu(int cpu) -{ -} - -#endif - -/* - * This does the RCU processing work from softirq context. - */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - unsigned long flags; - long completed_snap; - - if (rdp->nxtlist) { - local_irq_save(flags); - completed_snap = ACCESS_ONCE(rcp->completed); - - /* - * move the other grace-period-completed entries to - * [rdp->nxtlist, *rdp->nxttail[0]) temporarily - */ - if (!rcu_batch_before(completed_snap, rdp->batch)) - rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; - else if (!rcu_batch_before(completed_snap, rdp->batch - 1)) - rdp->nxttail[0] = rdp->nxttail[1]; - - /* - * the grace period for entries in - * [rdp->nxtlist, *rdp->nxttail[0]) has completed and - * move these entries to donelist - */ - if (rdp->nxttail[0] != &rdp->nxtlist) { - *rdp->donetail = rdp->nxtlist; - rdp->donetail = rdp->nxttail[0]; - rdp->nxtlist = *rdp->nxttail[0]; - *rdp->donetail = NULL; - - if (rdp->nxttail[1] == rdp->nxttail[0]) - rdp->nxttail[1] = &rdp->nxtlist; - if (rdp->nxttail[2] == rdp->nxttail[0]) - rdp->nxttail[2] = &rdp->nxtlist; - rdp->nxttail[0] = &rdp->nxtlist; - } - - local_irq_restore(flags); - - if (rcu_batch_after(rdp->batch, rcp->pending)) { - unsigned long flags2; - - /* and start it/schedule start if it's a new batch */ - spin_lock_irqsave(&rcp->lock, flags2); - if (rcu_batch_after(rdp->batch, rcp->pending)) { - rcp->pending = rdp->batch; - rcu_start_batch(rcp); - } - spin_unlock_irqrestore(&rcp->lock, flags2); - } - } - - rcu_check_quiescent_state(rcp, rdp); - if (rdp->donelist) - rcu_do_batch(rdp); -} - -static void rcu_process_callbacks(struct softirq_action *unused) -{ - /* - * Memory references from any prior RCU read-side critical sections - * executed by the interrupted code must be see before any RCU - * grace-period manupulations below. - */ - - smp_mb(); /* See above block comment. */ - - __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); - __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); - - /* - * Memory references from any later RCU read-side critical sections - * executed by the interrupted code must be see after any RCU - * grace-period manupulations above. - */ - - smp_mb(); /* See above block comment. */ -} - -static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - /* Check for CPU stalls, if enabled. */ - check_cpu_stall(rcp); - - if (rdp->nxtlist) { - long completed_snap = ACCESS_ONCE(rcp->completed); - - /* - * This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (!rcu_batch_before(completed_snap, rdp->batch)) - return 1; - if (!rcu_batch_before(completed_snap, rdp->batch - 1) && - rdp->nxttail[0] != rdp->nxttail[1]) - return 1; - if (rdp->nxttail[0] != &rdp->nxtlist) - return 1; - - /* - * This cpu has pending rcu entries and the new batch - * for then hasn't been started nor scheduled start - */ - if (rcu_batch_after(rdp->batch, rcp->pending)) - return 1; - } - - /* This cpu has finished callbacks to invoke */ - if (rdp->donelist) - return 1; - - /* The rcu core waits for a quiescent state from the cpu */ - if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) - return 1; - - /* nothing to do */ - return 0; -} - -/* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, returning 1 if so. This function is part of the - * RCU implementation; it is -not- an exported member of the RCU API. - */ -int rcu_pending(int cpu) -{ - return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || - __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); -} - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - - return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); -} - -/* - * Top-level function driving RCU grace-period detection, normally - * invoked from the scheduler-clock interrupt. This function simply - * increments counters that are read only from softirq by this same - * CPU, so there are no memory barriers required. - */ -void rcu_check_callbacks(int cpu, int user) -{ - if (user || - (idle_cpu(cpu) && rcu_scheduler_active && - !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { - - /* - * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so count it. - * - * Also do a memory barrier. This is needed to handle - * the case where writes from a preempt-disable section - * of code get reordered into schedule() by this CPU's - * write buffer. The memory barrier makes sure that - * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see - * by other CPUs to happen after any such write. - */ - - smp_mb(); /* See above block comment. */ - rcu_qsctr_inc(cpu); - rcu_bh_qsctr_inc(cpu); - - } else if (!in_softirq()) { - - /* - * Get here if this CPU did not take its interrupt from - * softirq, in other words, if it is not interrupting - * a rcu_bh read-side critical section. This is an _bh - * critical section, so count it. The memory barrier - * is needed for the same reason as is the above one. - */ - - smp_mb(); /* See above block comment. */ - rcu_bh_qsctr_inc(cpu); - } - raise_rcu_softirq(); -} - -static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - unsigned long flags; - - spin_lock_irqsave(&rcp->lock, flags); - memset(rdp, 0, sizeof(*rdp)); - rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; - rdp->donetail = &rdp->donelist; - rdp->quiescbatch = rcp->completed; - rdp->qs_pending = 0; - rdp->cpu = cpu; - rdp->blimit = blimit; - spin_unlock_irqrestore(&rcp->lock, flags); -} - -static void __cpuinit rcu_online_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); - - rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); - rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); -} - -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_online_cpu(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - rcu_offline_cpu(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - -/* - * Initializes rcu mechanism. Assumed to be called early. - * That is before local timer(SMP) or jiffie timer (uniproc) is setup. - * Note that rcu_qsctr and friends are implicitly - * initialized due to the choice of ``0'' for RCU_CTR_INVALID. - */ -void __init __rcu_init(void) -{ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - /* Register notifier for non-boot CPUs */ - register_cpu_notifier(&rcu_nb); -} - -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a967c9f..37ac454 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -19,7 +19,7 @@ * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> - * + * * Based on the original work by Paul McKenney <paulmck@us.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: @@ -27,7 +27,7 @@ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - - * http://lse.sourceforge.net/locking/rcupdate.html + * http://lse.sourceforge.net/locking/rcupdate.html * */ #include <linux/types.h> @@ -74,6 +74,8 @@ void wakeme_after_rcu(struct rcu_head *head) complete(&rcu->completion); } +#ifdef CONFIG_TREE_PREEMPT_RCU + /** * synchronize_rcu - wait until a grace period has elapsed. * @@ -87,7 +89,7 @@ void synchronize_rcu(void) { struct rcu_synchronize rcu; - if (rcu_blocking_is_gp()) + if (!rcu_scheduler_active) return; init_completion(&rcu.completion); @@ -98,6 +100,70 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +/** + * synchronize_sched - wait until an rcu-sched grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-sched + * grace period has elapsed, in other words after all currently executing + * rcu-sched read-side critical sections have completed. These read-side + * critical sections are delimited by rcu_read_lock_sched() and + * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), + * local_irq_disable(), and so on may be used in place of + * rcu_read_lock_sched(). + * + * This means that all preempt_disable code sequences, including NMI and + * hardware-interrupt handlers, in progress on entry will have completed + * before this primitive returns. However, this does not guarantee that + * softirq handlers will have completed, since in some kernels, these + * handlers can run in process context, and can block. + * + * This primitive provides the guarantees made by the (now removed) + * synchronize_kernel() API. In contrast, synchronize_rcu() only + * guarantees that rcu_read_lock() sections will have completed. + * In "classic RCU", these two guarantees happen to be one and + * the same, but can differ in realtime RCU implementations. + */ +void synchronize_sched(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + +/** + * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. + * + * Control will return to the caller some time after a full rcu_bh grace + * period has elapsed, in other words after all currently executing rcu_bh + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), + * and may be nested. + */ +void synchronize_rcu_bh(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_bh(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_bh); + static void rcu_barrier_callback(struct rcu_head *notused) { if (atomic_dec_and_test(&rcu_barrier_cpu_count)) @@ -129,6 +195,7 @@ static void rcu_barrier_func(void *type) static inline void wait_migrated_callbacks(void) { wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); + smp_mb(); /* In case we didn't sleep. */ } /* @@ -192,9 +259,13 @@ static void rcu_migrate_callback(struct rcu_head *notused) wake_up(&rcu_migrate_wq); } +extern int rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); + static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, unsigned long action, void *hcpu) { + rcu_cpu_notify(self, action, hcpu); if (action == CPU_DYING) { /* * preempt_disable() in on_each_cpu() prevents stop_machine(), @@ -209,7 +280,8 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); - } else if (action == CPU_POST_DEAD) { + } else if (action == CPU_DOWN_PREPARE) { + /* Don't need to wait until next removal operation. */ /* rcu_migrate_head is protected by cpu_add_remove_lock */ wait_migrated_callbacks(); } @@ -219,8 +291,18 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, void __init rcu_init(void) { + int i; + __rcu_init(); - hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); + cpu_notifier(rcu_barrier_cpu_hotplug, 0); + + /* + * We don't need protection against CPU-hotplug here because + * this is called early in boot, before either interrupts + * or the scheduler are operational. + */ + for_each_online_cpu(i) + rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i); } void rcu_scheduler_starting(void) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c deleted file mode 100644 index beb0e65..0000000 --- a/kernel/rcupreempt.c +++ /dev/null @@ -1,1539 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion, realtime implementation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2006 - * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> - * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar - * for pushing me away from locks and towards counters, and - * to Suparna Bhattacharya for pushing me completely away - * from atomic instructions on the read side. - * - * - Added handling of Dynamic Ticks - * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com> - * - Steven Rostedt <srostedt@redhat.com> - * - * Papers: http://www.rdrop.com/users/paulmck/RCU - * - * Design Document: http://lwn.net/Articles/253651/ - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/spinlock.h> -#include <linux/smp.h> -#include <linux/rcupdate.h> -#include <linux/interrupt.h> -#include <linux/sched.h> -#include <asm/atomic.h> -#include <linux/bitops.h> -#include <linux/module.h> -#include <linux/kthread.h> -#include <linux/completion.h> -#include <linux/moduleparam.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/random.h> -#include <linux/delay.h> -#include <linux/cpumask.h> -#include <linux/rcupreempt_trace.h> -#include <asm/byteorder.h> - -/* - * PREEMPT_RCU data structures. - */ - -/* - * GP_STAGES specifies the number of times the state machine has - * to go through the all the rcu_try_flip_states (see below) - * in a single Grace Period. - * - * GP in GP_STAGES stands for Grace Period ;) - */ -#define GP_STAGES 2 -struct rcu_data { - spinlock_t lock; /* Protect rcu_data fields. */ - long completed; /* Number of last completed batch. */ - int waitlistcount; - struct rcu_head *nextlist; - struct rcu_head **nexttail; - struct rcu_head *waitlist[GP_STAGES]; - struct rcu_head **waittail[GP_STAGES]; - struct rcu_head *donelist; /* from waitlist & waitschedlist */ - struct rcu_head **donetail; - long rcu_flipctr[2]; - struct rcu_head *nextschedlist; - struct rcu_head **nextschedtail; - struct rcu_head *waitschedlist; - struct rcu_head **waitschedtail; - int rcu_sched_sleeping; -#ifdef CONFIG_RCU_TRACE - struct rcupreempt_trace trace; -#endif /* #ifdef CONFIG_RCU_TRACE */ -}; - -/* - * States for rcu_try_flip() and friends. - */ - -enum rcu_try_flip_states { - - /* - * Stay here if nothing is happening. Flip the counter if somthing - * starts happening. Denoted by "I" - */ - rcu_try_flip_idle_state, - - /* - * Wait here for all CPUs to notice that the counter has flipped. This - * prevents the old set of counters from ever being incremented once - * we leave this state, which in turn is necessary because we cannot - * test any individual counter for zero -- we can only check the sum. - * Denoted by "A". - */ - rcu_try_flip_waitack_state, - - /* - * Wait here for the sum of the old per-CPU counters to reach zero. - * Denoted by "Z". - */ - rcu_try_flip_waitzero_state, - - /* - * Wait here for each of the other CPUs to execute a memory barrier. - * This is necessary to ensure that these other CPUs really have - * completed executing their RCU read-side critical sections, despite - * their CPUs wildly reordering memory. Denoted by "M". - */ - rcu_try_flip_waitmb_state, -}; - -/* - * States for rcu_ctrlblk.rcu_sched_sleep. - */ - -enum rcu_sched_sleep_states { - rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */ - rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */ - rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */ -}; - -struct rcu_ctrlblk { - spinlock_t fliplock; /* Protect state-machine transitions. */ - long completed; /* Number of last completed batch. */ - enum rcu_try_flip_states rcu_try_flip_state; /* The current state of - the rcu state machine */ - spinlock_t schedlock; /* Protect rcu_sched sleep state. */ - enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */ - wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ -}; - -struct rcu_dyntick_sched { - int dynticks; - int dynticks_snap; - int sched_qs; - int sched_qs_snap; - int sched_dynticks_snap; -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { - .dynticks = 1, -}; - -void rcu_qsctr_inc(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - rdssp->sched_qs++; -} - -#ifdef CONFIG_NO_HZ - -void rcu_enter_nohz(void) -{ - static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); - - smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ - __get_cpu_var(rcu_dyntick_sched).dynticks++; - WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs); -} - -void rcu_exit_nohz(void) -{ - static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); - - __get_cpu_var(rcu_dyntick_sched).dynticks++; - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ - WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1), - &rs); -} - -#endif /* CONFIG_NO_HZ */ - - -static DEFINE_PER_CPU(struct rcu_data, rcu_data); - -static struct rcu_ctrlblk rcu_ctrlblk = { - .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), - .completed = 0, - .rcu_try_flip_state = rcu_try_flip_idle_state, - .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock), - .sched_sleep = rcu_sched_not_sleeping, - .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq), -}; - -static struct task_struct *rcu_sched_grace_period_task; - -#ifdef CONFIG_RCU_TRACE -static char *rcu_try_flip_state_names[] = - { "idle", "waitack", "waitzero", "waitmb" }; -#endif /* #ifdef CONFIG_RCU_TRACE */ - -static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly - = CPU_BITS_NONE; - -/* - * Enum and per-CPU flag to determine when each CPU has seen - * the most recent counter flip. - */ - -enum rcu_flip_flag_values { - rcu_flip_seen, /* Steady/initial state, last flip seen. */ - /* Only GP detector can update. */ - rcu_flipped /* Flip just completed, need confirmation. */ - /* Only corresponding CPU can update. */ -}; -static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag) - = rcu_flip_seen; - -/* - * Enum and per-CPU flag to determine when each CPU has executed the - * needed memory barrier to fence in memory references from its last RCU - * read-side critical section in the just-completed grace period. - */ - -enum rcu_mb_flag_values { - rcu_mb_done, /* Steady/initial state, no mb()s required. */ - /* Only GP detector can update. */ - rcu_mb_needed /* Flip just completed, need an mb(). */ - /* Only corresponding CPU can update. */ -}; -static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) - = rcu_mb_done; - -/* - * RCU_DATA_ME: find the current CPU's rcu_data structure. - * RCU_DATA_CPU: find the specified CPU's rcu_data structure. - */ -#define RCU_DATA_ME() (&__get_cpu_var(rcu_data)) -#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu)) - -/* - * Helper macro for tracing when the appropriate rcu_data is not - * cached in a local variable, but where the CPU number is so cached. - */ -#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace)); - -/* - * Helper macro for tracing when the appropriate rcu_data is not - * cached in a local variable. - */ -#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace)); - -/* - * Helper macro for tracing when the appropriate rcu_data is pointed - * to by a local variable. - */ -#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); - -#define RCU_SCHED_BATCH_TIME (HZ / 50) - -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed(void) -{ - return rcu_ctrlblk.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -void __rcu_read_lock(void) -{ - int idx; - struct task_struct *t = current; - int nesting; - - nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); - if (nesting != 0) { - - /* An earlier rcu_read_lock() covers us, just count it. */ - - t->rcu_read_lock_nesting = nesting + 1; - - } else { - unsigned long flags; - - /* - * We disable interrupts for the following reasons: - * - If we get scheduling clock interrupt here, and we - * end up acking the counter flip, it's like a promise - * that we will never increment the old counter again. - * Thus we will break that promise if that - * scheduling clock interrupt happens between the time - * we pick the .completed field and the time that we - * increment our counter. - * - * - We don't want to be preempted out here. - * - * NMIs can still occur, of course, and might themselves - * contain rcu_read_lock(). - */ - - local_irq_save(flags); - - /* - * Outermost nesting of rcu_read_lock(), so increment - * the current counter for the current CPU. Use volatile - * casts to prevent the compiler from reordering. - */ - - idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1; - ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++; - - /* - * Now that the per-CPU counter has been incremented, we - * are protected from races with rcu_read_lock() invoked - * from NMI handlers on this CPU. We can therefore safely - * increment the nesting counter, relieving further NMIs - * of the need to increment the per-CPU counter. - */ - - ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1; - - /* - * Now that we have preventing any NMIs from storing - * to the ->rcu_flipctr_idx, we can safely use it to - * remember which counter to decrement in the matching - * rcu_read_unlock(). - */ - - ACCESS_ONCE(t->rcu_flipctr_idx) = idx; - local_irq_restore(flags); - } -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - -void __rcu_read_unlock(void) -{ - int idx; - struct task_struct *t = current; - int nesting; - - nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); - if (nesting > 1) { - - /* - * We are still protected by the enclosing rcu_read_lock(), - * so simply decrement the counter. - */ - - t->rcu_read_lock_nesting = nesting - 1; - - } else { - unsigned long flags; - - /* - * Disable local interrupts to prevent the grace-period - * detection state machine from seeing us half-done. - * NMIs can still occur, of course, and might themselves - * contain rcu_read_lock() and rcu_read_unlock(). - */ - - local_irq_save(flags); - - /* - * Outermost nesting of rcu_read_unlock(), so we must - * decrement the current counter for the current CPU. - * This must be done carefully, because NMIs can - * occur at any point in this code, and any rcu_read_lock() - * and rcu_read_unlock() pairs in the NMI handlers - * must interact non-destructively with this code. - * Lots of volatile casts, and -very- careful ordering. - * - * Changes to this code, including this one, must be - * inspected, validated, and tested extremely carefully!!! - */ - - /* - * First, pick up the index. - */ - - idx = ACCESS_ONCE(t->rcu_flipctr_idx); - - /* - * Now that we have fetched the counter index, it is - * safe to decrement the per-task RCU nesting counter. - * After this, any interrupts or NMIs will increment and - * decrement the per-CPU counters. - */ - ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1; - - /* - * It is now safe to decrement this task's nesting count. - * NMIs that occur after this statement will route their - * rcu_read_lock() calls through this "else" clause, and - * will thus start incrementing the per-CPU counter on - * their own. They will also clobber ->rcu_flipctr_idx, - * but that is OK, since we have already fetched it. - */ - - ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--; - local_irq_restore(flags); - } -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - -/* - * If a global counter flip has occurred since the last time that we - * advanced callbacks, advance them. Hardware interrupts must be - * disabled when calling this function. - */ -static void __rcu_advance_callbacks(struct rcu_data *rdp) -{ - int cpu; - int i; - int wlc = 0; - - if (rdp->completed != rcu_ctrlblk.completed) { - if (rdp->waitlist[GP_STAGES - 1] != NULL) { - *rdp->donetail = rdp->waitlist[GP_STAGES - 1]; - rdp->donetail = rdp->waittail[GP_STAGES - 1]; - RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp); - } - for (i = GP_STAGES - 2; i >= 0; i--) { - if (rdp->waitlist[i] != NULL) { - rdp->waitlist[i + 1] = rdp->waitlist[i]; - rdp->waittail[i + 1] = rdp->waittail[i]; - wlc++; - } else { - rdp->waitlist[i + 1] = NULL; - rdp->waittail[i + 1] = - &rdp->waitlist[i + 1]; - } - } - if (rdp->nextlist != NULL) { - rdp->waitlist[0] = rdp->nextlist; - rdp->waittail[0] = rdp->nexttail; - wlc++; - rdp->nextlist = NULL; - rdp->nexttail = &rdp->nextlist; - RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp); - } else { - rdp->waitlist[0] = NULL; - rdp->waittail[0] = &rdp->waitlist[0]; - } - rdp->waitlistcount = wlc; - rdp->completed = rcu_ctrlblk.completed; - } - - /* - * Check to see if this CPU needs to report that it has seen - * the most recent counter flip, thereby declaring that all - * subsequent rcu_read_lock() invocations will respect this flip. - */ - - cpu = raw_smp_processor_id(); - if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { - smp_mb(); /* Subsequent counter accesses must see new value */ - per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; - smp_mb(); /* Subsequent RCU read-side critical sections */ - /* seen -after- acknowledgement. */ - } -} - -#ifdef CONFIG_NO_HZ -static DEFINE_PER_CPU(int, rcu_update_flag); - -/** - * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. - * - * If the CPU was idle with dynamic ticks active, this updates the - * rcu_dyntick_sched.dynticks to let the RCU handling know that the - * CPU is active. - */ -void rcu_irq_enter(void) -{ - int cpu = smp_processor_id(); - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - if (per_cpu(rcu_update_flag, cpu)) - per_cpu(rcu_update_flag, cpu)++; - - /* - * Only update if we are coming from a stopped ticks mode - * (rcu_dyntick_sched.dynticks is even). - */ - if (!in_interrupt() && - (rdssp->dynticks & 0x1) == 0) { - /* - * The following might seem like we could have a race - * with NMI/SMIs. But this really isn't a problem. - * Here we do a read/modify/write, and the race happens - * when an NMI/SMI comes in after the read and before - * the write. But NMI/SMIs will increment this counter - * twice before returning, so the zero bit will not - * be corrupted by the NMI/SMI which is the most important - * part. - * - * The only thing is that we would bring back the counter - * to a postion that it was in during the NMI/SMI. - * But the zero bit would be set, so the rest of the - * counter would again be ignored. - * - * On return from the IRQ, the counter may have the zero - * bit be 0 and the counter the same as the return from - * the NMI/SMI. If the state machine was so unlucky to - * see that, it still doesn't matter, since all - * RCU read-side critical sections on this CPU would - * have already completed. - */ - rdssp->dynticks++; - /* - * The following memory barrier ensures that any - * rcu_read_lock() primitives in the irq handler - * are seen by other CPUs to follow the above - * increment to rcu_dyntick_sched.dynticks. This is - * required in order for other CPUs to correctly - * determine when it is safe to advance the RCU - * grace-period state machine. - */ - smp_mb(); /* see above block comment. */ - /* - * Since we can't determine the dynamic tick mode from - * the rcu_dyntick_sched.dynticks after this routine, - * we use a second flag to acknowledge that we came - * from an idle state with ticks stopped. - */ - per_cpu(rcu_update_flag, cpu)++; - /* - * If we take an NMI/SMI now, they will also increment - * the rcu_update_flag, and will not update the - * rcu_dyntick_sched.dynticks on exit. That is for - * this IRQ to do. - */ - } -} - -/** - * rcu_irq_exit - Called from exiting Hard irq context. - * - * If the CPU was idle with dynamic ticks active, update the - * rcu_dyntick_sched.dynticks to put let the RCU handling be - * aware that the CPU is going back to idle with no ticks. - */ -void rcu_irq_exit(void) -{ - int cpu = smp_processor_id(); - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - /* - * rcu_update_flag is set if we interrupted the CPU - * when it was idle with ticks stopped. - * Once this occurs, we keep track of interrupt nesting - * because a NMI/SMI could also come in, and we still - * only want the IRQ that started the increment of the - * rcu_dyntick_sched.dynticks to be the one that modifies - * it on exit. - */ - if (per_cpu(rcu_update_flag, cpu)) { - if (--per_cpu(rcu_update_flag, cpu)) - return; - - /* This must match the interrupt nesting */ - WARN_ON(in_interrupt()); - - /* - * If an NMI/SMI happens now we are still - * protected by the rcu_dyntick_sched.dynticks being odd. - */ - - /* - * The following memory barrier ensures that any - * rcu_read_unlock() primitives in the irq handler - * are seen by other CPUs to preceed the following - * increment to rcu_dyntick_sched.dynticks. This - * is required in order for other CPUs to determine - * when it is safe to advance the RCU grace-period - * state machine. - */ - smp_mb(); /* see above block comment. */ - rdssp->dynticks++; - WARN_ON(rdssp->dynticks & 0x1); - } -} - -void rcu_nmi_enter(void) -{ - rcu_irq_enter(); -} - -void rcu_nmi_exit(void) -{ - rcu_irq_exit(); -} - -static void dyntick_save_progress_counter(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - rdssp->dynticks_snap = rdssp->dynticks; -} - -static inline int -rcu_try_flip_waitack_needed(int cpu) -{ - long curr; - long snap; - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - curr = rdssp->dynticks; - snap = rdssp->dynticks_snap; - smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ - - /* - * If the CPU remained in dynticks mode for the entire time - * and didn't take any interrupts, NMIs, SMIs, or whatever, - * then it cannot be in the middle of an rcu_read_lock(), so - * the next rcu_read_lock() it executes must use the new value - * of the counter. So we can safely pretend that this CPU - * already acknowledged the counter. - */ - - if ((curr == snap) && ((curr & 0x1) == 0)) - return 0; - - /* - * If the CPU passed through or entered a dynticks idle phase with - * no active irq handlers, then, as above, we can safely pretend - * that this CPU already acknowledged the counter. - */ - - if ((curr - snap) > 2 || (curr & 0x1) == 0) - return 0; - - /* We need this CPU to explicitly acknowledge the counter flip. */ - - return 1; -} - -static inline int -rcu_try_flip_waitmb_needed(int cpu) -{ - long curr; - long snap; - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - curr = rdssp->dynticks; - snap = rdssp->dynticks_snap; - smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ - - /* - * If the CPU remained in dynticks mode for the entire time - * and didn't take any interrupts, NMIs, SMIs, or whatever, - * then it cannot have executed an RCU read-side critical section - * during that time, so there is no need for it to execute a - * memory barrier. - */ - - if ((curr == snap) && ((curr & 0x1) == 0)) - return 0; - - /* - * If the CPU either entered or exited an outermost interrupt, - * SMI, NMI, or whatever handler, then we know that it executed - * a memory barrier when doing so. So we don't need another one. - */ - if (curr != snap) - return 0; - - /* We need the CPU to execute a memory barrier. */ - - return 1; -} - -static void dyntick_save_progress_counter_sched(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - rdssp->sched_dynticks_snap = rdssp->dynticks; -} - -static int rcu_qsctr_inc_needed_dyntick(int cpu) -{ - long curr; - long snap; - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - curr = rdssp->dynticks; - snap = rdssp->sched_dynticks_snap; - smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ - - /* - * If the CPU remained in dynticks mode for the entire time - * and didn't take any interrupts, NMIs, SMIs, or whatever, - * then it cannot be in the middle of an rcu_read_lock(), so - * the next rcu_read_lock() it executes must use the new value - * of the counter. Therefore, this CPU has been in a quiescent - * state the entire time, and we don't need to wait for it. - */ - - if ((curr == snap) && ((curr & 0x1) == 0)) - return 0; - - /* - * If the CPU passed through or entered a dynticks idle phase with - * no active irq handlers, then, as above, this CPU has already - * passed through a quiescent state. - */ - - if ((curr - snap) > 2 || (snap & 0x1) == 0) - return 0; - - /* We need this CPU to go through a quiescent state. */ - - return 1; -} - -#else /* !CONFIG_NO_HZ */ - -# define dyntick_save_progress_counter(cpu) do { } while (0) -# define rcu_try_flip_waitack_needed(cpu) (1) -# define rcu_try_flip_waitmb_needed(cpu) (1) - -# define dyntick_save_progress_counter_sched(cpu) do { } while (0) -# define rcu_qsctr_inc_needed_dyntick(cpu) (1) - -#endif /* CONFIG_NO_HZ */ - -static void save_qsctr_sched(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - rdssp->sched_qs_snap = rdssp->sched_qs; -} - -static inline int rcu_qsctr_inc_needed(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - /* - * If there has been a quiescent state, no more need to wait - * on this CPU. - */ - - if (rdssp->sched_qs != rdssp->sched_qs_snap) { - smp_mb(); /* force ordering with cpu entering schedule(). */ - return 0; - } - - /* We need this CPU to go through a quiescent state. */ - - return 1; -} - -/* - * Get here when RCU is idle. Decide whether we need to - * move out of idle state, and return non-zero if so. - * "Straightforward" approach for the moment, might later - * use callback-list lengths, grace-period duration, or - * some such to determine when to exit idle state. - * Might also need a pre-idle test that does not acquire - * the lock, but let's get the simple case working first... - */ - -static int -rcu_try_flip_idle(void) -{ - int cpu; - - RCU_TRACE_ME(rcupreempt_trace_try_flip_i1); - if (!rcu_pending(smp_processor_id())) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1); - return 0; - } - - /* - * Do the flip. - */ - - RCU_TRACE_ME(rcupreempt_trace_try_flip_g1); - rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */ - - /* - * Need a memory barrier so that other CPUs see the new - * counter value before they see the subsequent change of all - * the rcu_flip_flag instances to rcu_flipped. - */ - - smp_mb(); /* see above block comment. */ - - /* Now ask each CPU for acknowledgement of the flip. */ - - for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { - per_cpu(rcu_flip_flag, cpu) = rcu_flipped; - dyntick_save_progress_counter(cpu); - } - - return 1; -} - -/* - * Wait for CPUs to acknowledge the flip. - */ - -static int -rcu_try_flip_waitack(void) -{ - int cpu; - - RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); - for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) - if (rcu_try_flip_waitack_needed(cpu) && - per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); - return 0; - } - - /* - * Make sure our checks above don't bleed into subsequent - * waiting for the sum of the counters to reach zero. - */ - - smp_mb(); /* see above block comment. */ - RCU_TRACE_ME(rcupreempt_trace_try_flip_a2); - return 1; -} - -/* - * Wait for collective ``last'' counter to reach zero, - * then tell all CPUs to do an end-of-grace-period memory barrier. - */ - -static int -rcu_try_flip_waitzero(void) -{ - int cpu; - int lastidx = !(rcu_ctrlblk.completed & 0x1); - int sum = 0; - - /* Check to see if the sum of the "last" counters is zero. */ - - RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); - for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) - sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; - if (sum != 0) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); - return 0; - } - - /* - * This ensures that the other CPUs see the call for - * memory barriers -after- the sum to zero has been - * detected here - */ - smp_mb(); /* ^^^^^^^^^^^^ */ - - /* Call for a memory barrier from each CPU. */ - for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { - per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; - dyntick_save_progress_counter(cpu); - } - - RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); - return 1; -} - -/* - * Wait for all CPUs to do their end-of-grace-period memory barrier. - * Return 0 once all CPUs have done so. - */ - -static int -rcu_try_flip_waitmb(void) -{ - int cpu; - - RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); - for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) - if (rcu_try_flip_waitmb_needed(cpu) && - per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); - return 0; - } - - smp_mb(); /* Ensure that the above checks precede any following flip. */ - RCU_TRACE_ME(rcupreempt_trace_try_flip_m2); - return 1; -} - -/* - * Attempt a single flip of the counters. Remember, a single flip does - * -not- constitute a grace period. Instead, the interval between - * at least GP_STAGES consecutive flips is a grace period. - * - * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation - * on a large SMP, they might want to use a hierarchical organization of - * the per-CPU-counter pairs. - */ -static void rcu_try_flip(void) -{ - unsigned long flags; - - RCU_TRACE_ME(rcupreempt_trace_try_flip_1); - if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_e1); - return; - } - - /* - * Take the next transition(s) through the RCU grace-period - * flip-counter state machine. - */ - - switch (rcu_ctrlblk.rcu_try_flip_state) { - case rcu_try_flip_idle_state: - if (rcu_try_flip_idle()) - rcu_ctrlblk.rcu_try_flip_state = - rcu_try_flip_waitack_state; - break; - case rcu_try_flip_waitack_state: - if (rcu_try_flip_waitack()) - rcu_ctrlblk.rcu_try_flip_state = - rcu_try_flip_waitzero_state; - break; - case rcu_try_flip_waitzero_state: - if (rcu_try_flip_waitzero()) - rcu_ctrlblk.rcu_try_flip_state = - rcu_try_flip_waitmb_state; - break; - case rcu_try_flip_waitmb_state: - if (rcu_try_flip_waitmb()) - rcu_ctrlblk.rcu_try_flip_state = - rcu_try_flip_idle_state; - } - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); -} - -/* - * Check to see if this CPU needs to do a memory barrier in order to - * ensure that any prior RCU read-side critical sections have committed - * their counter manipulations and critical-section memory references - * before declaring the grace period to be completed. - */ -static void rcu_check_mb(int cpu) -{ - if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) { - smp_mb(); /* Ensure RCU read-side accesses are visible. */ - per_cpu(rcu_mb_flag, cpu) = rcu_mb_done; - } -} - -void rcu_check_callbacks(int cpu, int user) -{ - unsigned long flags; - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - /* - * If this CPU took its interrupt from user mode or from the - * idle loop, and this is not a nested interrupt, then - * this CPU has to have exited all prior preept-disable - * sections of code. So increment the counter to note this. - * - * The memory barrier is needed to handle the case where - * writes from a preempt-disable section of code get reordered - * into schedule() by this CPU's write buffer. So the memory - * barrier makes sure that the rcu_qsctr_inc() is seen by other - * CPUs to happen after any such write. - */ - - if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { - smp_mb(); /* Guard against aggressive schedule(). */ - rcu_qsctr_inc(cpu); - } - - rcu_check_mb(cpu); - if (rcu_ctrlblk.completed == rdp->completed) - rcu_try_flip(); - spin_lock_irqsave(&rdp->lock, flags); - RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); - __rcu_advance_callbacks(rdp); - if (rdp->donelist == NULL) { - spin_unlock_irqrestore(&rdp->lock, flags); - } else { - spin_unlock_irqrestore(&rdp->lock, flags); - raise_softirq(RCU_SOFTIRQ); - } -} - -/* - * Needed by dynticks, to make sure all RCU processing has finished - * when we go idle: - */ -void rcu_advance_callbacks(int cpu, int user) -{ - unsigned long flags; - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - if (rcu_ctrlblk.completed == rdp->completed) { - rcu_try_flip(); - if (rcu_ctrlblk.completed == rdp->completed) - return; - } - spin_lock_irqsave(&rdp->lock, flags); - RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); - __rcu_advance_callbacks(rdp); - spin_unlock_irqrestore(&rdp->lock, flags); -} - -#ifdef CONFIG_HOTPLUG_CPU -#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \ - *dsttail = srclist; \ - if (srclist != NULL) { \ - dsttail = srctail; \ - srclist = NULL; \ - srctail = &srclist;\ - } \ - } while (0) - -void rcu_offline_cpu(int cpu) -{ - int i; - struct rcu_head *list = NULL; - unsigned long flags; - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - struct rcu_head *schedlist = NULL; - struct rcu_head **schedtail = &schedlist; - struct rcu_head **tail = &list; - - /* - * Remove all callbacks from the newly dead CPU, retaining order. - * Otherwise rcu_barrier() will fail - */ - - spin_lock_irqsave(&rdp->lock, flags); - rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail); - for (i = GP_STAGES - 1; i >= 0; i--) - rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], - list, tail); - rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); - rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail, - schedlist, schedtail); - rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail, - schedlist, schedtail); - rdp->rcu_sched_sleeping = 0; - spin_unlock_irqrestore(&rdp->lock, flags); - rdp->waitlistcount = 0; - - /* Disengage the newly dead CPU from the grace-period computation. */ - - spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); - rcu_check_mb(cpu); - if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { - smp_mb(); /* Subsequent counter accesses must see new value */ - per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; - smp_mb(); /* Subsequent RCU read-side critical sections */ - /* seen -after- acknowledgement. */ - } - - RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0]; - RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1]; - - RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; - RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; - - cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map)); - - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); - - /* - * Place the removed callbacks on the current CPU's queue. - * Make them all start a new grace period: simple approach, - * in theory could starve a given set of callbacks, but - * you would need to be doing some serious CPU hotplugging - * to make this happen. If this becomes a problem, adding - * a synchronize_rcu() to the hotplug path would be a simple - * fix. - */ - - local_irq_save(flags); /* disable preempt till we know what lock. */ - rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); - *rdp->nexttail = list; - if (list) - rdp->nexttail = tail; - *rdp->nextschedtail = schedlist; - if (schedlist) - rdp->nextschedtail = schedtail; - spin_unlock_irqrestore(&rdp->lock, flags); -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -void rcu_offline_cpu(int cpu) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - -void __cpuinit rcu_online_cpu(int cpu) -{ - unsigned long flags; - struct rcu_data *rdp; - - spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); - cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map)); - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); - - /* - * The rcu_sched grace-period processing might have bypassed - * this CPU, given that it was not in the rcu_cpu_online_map - * when the grace-period scan started. This means that the - * grace-period task might sleep. So make sure that if this - * should happen, the first callback posted to this CPU will - * wake up the grace-period task if need be. - */ - - rdp = RCU_DATA_CPU(cpu); - spin_lock_irqsave(&rdp->lock, flags); - rdp->rcu_sched_sleeping = 1; - spin_unlock_irqrestore(&rdp->lock, flags); -} - -static void rcu_process_callbacks(struct softirq_action *unused) -{ - unsigned long flags; - struct rcu_head *next, *list; - struct rcu_data *rdp; - - local_irq_save(flags); - rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); - list = rdp->donelist; - if (list == NULL) { - spin_unlock_irqrestore(&rdp->lock, flags); - return; - } - rdp->donelist = NULL; - rdp->donetail = &rdp->donelist; - RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp); - spin_unlock_irqrestore(&rdp->lock, flags); - while (list) { - next = list->next; - list->func(list); - list = next; - RCU_TRACE_ME(rcupreempt_trace_invoke); - } -} - -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - struct rcu_data *rdp; - - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); - __rcu_advance_callbacks(rdp); - *rdp->nexttail = head; - rdp->nexttail = &head->next; - RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); - spin_unlock_irqrestore(&rdp->lock, flags); -} -EXPORT_SYMBOL_GPL(call_rcu); - -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - struct rcu_data *rdp; - int wake_gp = 0; - - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); - *rdp->nextschedtail = head; - rdp->nextschedtail = &head->next; - if (rdp->rcu_sched_sleeping) { - - /* Grace-period processing might be sleeping... */ - - rdp->rcu_sched_sleeping = 0; - wake_gp = 1; - } - spin_unlock_irqrestore(&rdp->lock, flags); - if (wake_gp) { - - /* Wake up grace-period processing, unless someone beat us. */ - - spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); - if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping) - wake_gp = 0; - rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping; - spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - if (wake_gp) - wake_up_interruptible(&rcu_ctrlblk.sched_wq); - } -} -EXPORT_SYMBOL_GPL(call_rcu_sched); - -/* - * Wait until all currently running preempt_disable() code segments - * (including hardware-irq-disable segments) complete. Note that - * in -rt this does -not- necessarily result in all currently executing - * interrupt -handlers- having completed. - */ -void __synchronize_sched(void) -{ - struct rcu_synchronize rcu; - - if (num_online_cpus() == 1) - return; /* blocking is gp if only one CPU! */ - - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_sched(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); -} -EXPORT_SYMBOL_GPL(__synchronize_sched); - -/* - * kthread function that manages call_rcu_sched grace periods. - */ -static int rcu_sched_grace_period(void *arg) -{ - int couldsleep; /* might sleep after current pass. */ - int couldsleepnext = 0; /* might sleep after next pass. */ - int cpu; - unsigned long flags; - struct rcu_data *rdp; - int ret; - - /* - * Each pass through the following loop handles one - * rcu_sched grace period cycle. - */ - do { - /* Save each CPU's current state. */ - - for_each_online_cpu(cpu) { - dyntick_save_progress_counter_sched(cpu); - save_qsctr_sched(cpu); - } - - /* - * Sleep for about an RCU grace-period's worth to - * allow better batching and to consume less CPU. - */ - schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME); - - /* - * If there was nothing to do last time, prepare to - * sleep at the end of the current grace period cycle. - */ - couldsleep = couldsleepnext; - couldsleepnext = 1; - if (couldsleep) { - spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); - rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep; - spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - } - - /* - * Wait on each CPU in turn to have either visited - * a quiescent state or been in dynticks-idle mode. - */ - for_each_online_cpu(cpu) { - while (rcu_qsctr_inc_needed(cpu) && - rcu_qsctr_inc_needed_dyntick(cpu)) { - /* resched_cpu(cpu); @@@ */ - schedule_timeout_interruptible(1); - } - } - - /* Advance callbacks for each CPU. */ - - for_each_online_cpu(cpu) { - - rdp = RCU_DATA_CPU(cpu); - spin_lock_irqsave(&rdp->lock, flags); - - /* - * We are running on this CPU irq-disabled, so no - * CPU can go offline until we re-enable irqs. - * The current CPU might have already gone - * offline (between the for_each_offline_cpu and - * the spin_lock_irqsave), but in that case all its - * callback lists will be empty, so no harm done. - * - * Advance the callbacks! We share normal RCU's - * donelist, since callbacks are invoked the - * same way in either case. - */ - if (rdp->waitschedlist != NULL) { - *rdp->donetail = rdp->waitschedlist; - rdp->donetail = rdp->waitschedtail; - - /* - * Next rcu_check_callbacks() will - * do the required raise_softirq(). - */ - } - if (rdp->nextschedlist != NULL) { - rdp->waitschedlist = rdp->nextschedlist; - rdp->waitschedtail = rdp->nextschedtail; - couldsleep = 0; - couldsleepnext = 0; - } else { - rdp->waitschedlist = NULL; - rdp->waitschedtail = &rdp->waitschedlist; - } - rdp->nextschedlist = NULL; - rdp->nextschedtail = &rdp->nextschedlist; - - /* Mark sleep intention. */ - - rdp->rcu_sched_sleeping = couldsleep; - - spin_unlock_irqrestore(&rdp->lock, flags); - } - - /* If we saw callbacks on the last scan, go deal with them. */ - - if (!couldsleep) - continue; - - /* Attempt to block... */ - - spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); - if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) { - - /* - * Someone posted a callback after we scanned. - * Go take care of it. - */ - spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - couldsleepnext = 0; - continue; - } - - /* Block until the next person posts a callback. */ - - rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; - spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - ret = 0; /* unused */ - __wait_event_interruptible(rcu_ctrlblk.sched_wq, - rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, - ret); - - couldsleepnext = 0; - - } while (!kthread_should_stop()); - - return (0); -} - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. Assumes that notifiers would take care of handling any - * outstanding requests from the RCU core. - * - * This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - return (rdp->donelist != NULL || - !!rdp->waitlistcount || - rdp->nextlist != NULL || - rdp->nextschedlist != NULL || - rdp->waitschedlist != NULL); -} - -int rcu_pending(int cpu) -{ - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - /* The CPU has at least one callback queued somewhere. */ - - if (rdp->donelist != NULL || - !!rdp->waitlistcount || - rdp->nextlist != NULL || - rdp->nextschedlist != NULL || - rdp->waitschedlist != NULL) - return 1; - - /* The RCU core needs an acknowledgement from this CPU. */ - - if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) || - (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed)) - return 1; - - /* This CPU has fallen behind the global grace-period number. */ - - if (rdp->completed != rcu_ctrlblk.completed) - return 1; - - /* Nothing needed from this CPU. */ - - return 0; -} - -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_online_cpu(cpu); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - rcu_offline_cpu(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - -void __init __rcu_init(void) -{ - int cpu; - int i; - struct rcu_data *rdp; - - printk(KERN_NOTICE "Preemptible RCU implementation.\n"); - for_each_possible_cpu(cpu) { - rdp = RCU_DATA_CPU(cpu); - spin_lock_init(&rdp->lock); - rdp->completed = 0; - rdp->waitlistcount = 0; - rdp->nextlist = NULL; - rdp->nexttail = &rdp->nextlist; - for (i = 0; i < GP_STAGES; i++) { - rdp->waitlist[i] = NULL; - rdp->waittail[i] = &rdp->waitlist[i]; - } - rdp->donelist = NULL; - rdp->donetail = &rdp->donelist; - rdp->rcu_flipctr[0] = 0; - rdp->rcu_flipctr[1] = 0; - rdp->nextschedlist = NULL; - rdp->nextschedtail = &rdp->nextschedlist; - rdp->waitschedlist = NULL; - rdp->waitschedtail = &rdp->waitschedlist; - rdp->rcu_sched_sleeping = 0; - } - register_cpu_notifier(&rcu_nb); - - /* - * We don't need protection against CPU-Hotplug here - * since - * a) If a CPU comes online while we are iterating over the - * cpu_online_mask below, we would only end up making a - * duplicate call to rcu_online_cpu() which sets the corresponding - * CPU's mask in the rcu_cpu_online_map. - * - * b) A CPU cannot go offline at this point in time since the user - * does not have access to the sysfs interface, nor do we - * suspend the system. - */ - for_each_online_cpu(cpu) - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); - - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); -} - -/* - * Late-boot-time RCU initialization that must wait until after scheduler - * has been initialized. - */ -void __init rcu_init_sched(void) -{ - rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period, - NULL, - "rcu_sched_grace_period"); - WARN_ON(IS_ERR(rcu_sched_grace_period_task)); -} - -#ifdef CONFIG_RCU_TRACE -long *rcupreempt_flipctr(int cpu) -{ - return &RCU_DATA_CPU(cpu)->rcu_flipctr[0]; -} -EXPORT_SYMBOL_GPL(rcupreempt_flipctr); - -int rcupreempt_flip_flag(int cpu) -{ - return per_cpu(rcu_flip_flag, cpu); -} -EXPORT_SYMBOL_GPL(rcupreempt_flip_flag); - -int rcupreempt_mb_flag(int cpu) -{ - return per_cpu(rcu_mb_flag, cpu); -} -EXPORT_SYMBOL_GPL(rcupreempt_mb_flag); - -char *rcupreempt_try_flip_state_name(void) -{ - return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state]; -} -EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name); - -struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu) -{ - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - return &rdp->trace; -} -EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu); - -#endif /* #ifdef RCU_TRACE */ diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c deleted file mode 100644 index 7c2665c..0000000 --- a/kernel/rcupreempt_trace.c +++ /dev/null @@ -1,334 +0,0 @@ -/* - * Read-Copy Update tracing for realtime implementation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2006 - * - * Papers: http://www.rdrop.com/users/paulmck/RCU - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/spinlock.h> -#include <linux/smp.h> -#include <linux/rcupdate.h> -#include <linux/interrupt.h> -#include <linux/sched.h> -#include <asm/atomic.h> -#include <linux/bitops.h> -#include <linux/module.h> -#include <linux/completion.h> -#include <linux/moduleparam.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/mutex.h> -#include <linux/rcupreempt_trace.h> -#include <linux/debugfs.h> - -static struct mutex rcupreempt_trace_mutex; -static char *rcupreempt_trace_buf; -#define RCUPREEMPT_TRACE_BUF_SIZE 4096 - -void rcupreempt_trace_move2done(struct rcupreempt_trace *trace) -{ - trace->done_length += trace->wait_length; - trace->done_add += trace->wait_length; - trace->wait_length = 0; -} -void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace) -{ - trace->wait_length += trace->next_length; - trace->wait_add += trace->next_length; - trace->next_length = 0; -} -void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace) -{ - atomic_inc(&trace->rcu_try_flip_1); -} -void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace) -{ - atomic_inc(&trace->rcu_try_flip_e1); -} -void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_i1++; -} -void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_ie1++; -} -void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_g1++; -} -void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_a1++; -} -void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_ae1++; -} -void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_a2++; -} -void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_z1++; -} -void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_ze1++; -} -void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_z2++; -} -void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_m1++; -} -void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_me1++; -} -void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_m2++; -} -void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace) -{ - trace->rcu_check_callbacks++; -} -void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace) -{ - trace->done_remove += trace->done_length; - trace->done_length = 0; -} -void rcupreempt_trace_invoke(struct rcupreempt_trace *trace) -{ - atomic_inc(&trace->done_invoked); -} -void rcupreempt_trace_next_add(struct rcupreempt_trace *trace) -{ - trace->next_add++; - trace->next_length++; -} - -static void rcupreempt_trace_sum(struct rcupreempt_trace *sp) -{ - struct rcupreempt_trace *cp; - int cpu; - - memset(sp, 0, sizeof(*sp)); - for_each_possible_cpu(cpu) { - cp = rcupreempt_trace_cpu(cpu); - sp->next_length += cp->next_length; - sp->next_add += cp->next_add; - sp->wait_length += cp->wait_length; - sp->wait_add += cp->wait_add; - sp->done_length += cp->done_length; - sp->done_add += cp->done_add; - sp->done_remove += cp->done_remove; - atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked); - sp->rcu_check_callbacks += cp->rcu_check_callbacks; - atomic_add(atomic_read(&cp->rcu_try_flip_1), - &sp->rcu_try_flip_1); - atomic_add(atomic_read(&cp->rcu_try_flip_e1), - &sp->rcu_try_flip_e1); - sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; - sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; - sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; - sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1; - sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1; - sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2; - sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1; - sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1; - sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2; - sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1; - sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1; - sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2; - } -} - -static ssize_t rcustats_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - struct rcupreempt_trace trace; - ssize_t bcount; - int cnt = 0; - - rcupreempt_trace_sum(&trace); - mutex_lock(&rcupreempt_trace_mutex); - snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "ggp=%ld rcc=%ld\n", - rcu_batches_completed(), - trace.rcu_check_callbacks); - snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n" - "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n" - "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n", - - trace.next_add, trace.next_length, - trace.wait_add, trace.wait_length, - trace.done_add, trace.done_length, - trace.done_remove, atomic_read(&trace.done_invoked), - atomic_read(&trace.rcu_try_flip_1), - atomic_read(&trace.rcu_try_flip_e1), - trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1, - trace.rcu_try_flip_g1, - trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1, - trace.rcu_try_flip_a2, - trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1, - trace.rcu_try_flip_z2, - trace.rcu_try_flip_m1, trace.rcu_try_flip_me1, - trace.rcu_try_flip_m2); - bcount = simple_read_from_buffer(buffer, count, ppos, - rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); - mutex_unlock(&rcupreempt_trace_mutex); - return bcount; -} - -static ssize_t rcugp_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - long oldgp = rcu_batches_completed(); - ssize_t bcount; - - mutex_lock(&rcupreempt_trace_mutex); - synchronize_rcu(); - snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE, - "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed()); - bcount = simple_read_from_buffer(buffer, count, ppos, - rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); - mutex_unlock(&rcupreempt_trace_mutex); - return bcount; -} - -static ssize_t rcuctrs_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - int cnt = 0; - int cpu; - int f = rcu_batches_completed() & 0x1; - ssize_t bcount; - - mutex_lock(&rcupreempt_trace_mutex); - - cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE, - "CPU last cur F M\n"); - for_each_online_cpu(cpu) { - long *flipctr = rcupreempt_flipctr(cpu); - cnt += snprintf(&rcupreempt_trace_buf[cnt], - RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "%3d %4ld %3ld %d %d\n", - cpu, - flipctr[!f], - flipctr[f], - rcupreempt_flip_flag(cpu), - rcupreempt_mb_flag(cpu)); - } - cnt += snprintf(&rcupreempt_trace_buf[cnt], - RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "ggp = %ld, state = %s\n", - rcu_batches_completed(), - rcupreempt_try_flip_state_name()); - cnt += snprintf(&rcupreempt_trace_buf[cnt], - RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "\n"); - bcount = simple_read_from_buffer(buffer, count, ppos, - rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); - mutex_unlock(&rcupreempt_trace_mutex); - return bcount; -} - -static struct file_operations rcustats_fops = { - .owner = THIS_MODULE, - .read = rcustats_read, -}; - -static struct file_operations rcugp_fops = { - .owner = THIS_MODULE, - .read = rcugp_read, -}; - -static struct file_operations rcuctrs_fops = { - .owner = THIS_MODULE, - .read = rcuctrs_read, -}; - -static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir; -static int rcupreempt_debugfs_init(void) -{ - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto out; - statdir = debugfs_create_file("rcustats", 0444, rcudir, - NULL, &rcustats_fops); - if (!statdir) - goto free_out; - - gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); - if (!gpdir) - goto free_out; - - ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir, - NULL, &rcuctrs_fops); - if (!ctrsdir) - goto free_out; - return 0; -free_out: - if (statdir) - debugfs_remove(statdir); - if (gpdir) - debugfs_remove(gpdir); - debugfs_remove(rcudir); -out: - return 1; -} - -static int __init rcupreempt_trace_init(void) -{ - int ret; - - mutex_init(&rcupreempt_trace_mutex); - rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); - if (!rcupreempt_trace_buf) - return 1; - ret = rcupreempt_debugfs_init(); - if (ret) - kfree(rcupreempt_trace_buf); - return ret; -} - -static void __exit rcupreempt_trace_cleanup(void) -{ - debugfs_remove(statdir); - debugfs_remove(gpdir); - debugfs_remove(ctrsdir); - debugfs_remove(rcudir); - kfree(rcupreempt_trace_buf); -} - - -module_init(rcupreempt_trace_init); -module_exit(rcupreempt_trace_cleanup); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9b4a975..233768f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -18,7 +18,7 @@ * Copyright (C) IBM Corporation, 2005, 2006 * * Authors: Paul E. McKenney <paulmck@us.ibm.com> - * Josh Triplett <josh@freedesktop.org> + * Josh Triplett <josh@freedesktop.org> * * See also: Documentation/RCU/torture.txt */ @@ -50,7 +50,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " - "Josh Triplett <josh@freedesktop.org>"); + "Josh Triplett <josh@freedesktop.org>"); static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ @@ -110,8 +110,8 @@ struct rcu_torture { }; static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture *rcu_torture_current = NULL; -static long rcu_torture_current_version = 0; +static struct rcu_torture *rcu_torture_current; +static long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = @@ -124,11 +124,11 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; -static long n_rcu_torture_timers = 0; +static long n_rcu_torture_timers; static struct list_head rcu_torture_removed; static cpumask_var_t shuffle_tmp_mask; -static int stutter_pause_test = 0; +static int stutter_pause_test; #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -257,17 +257,18 @@ struct rcu_torture_ops { void (*init)(void); void (*cleanup)(void); int (*readlock)(void); - void (*readdelay)(struct rcu_random_state *rrsp); + void (*read_delay)(struct rcu_random_state *rrsp); void (*readunlock)(int idx); int (*completed)(void); - void (*deferredfree)(struct rcu_torture *p); + void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*cb_barrier)(void); int (*stats)(char *page); - int irqcapable; + int irq_capable; char *name; }; -static struct rcu_torture_ops *cur_ops = NULL; + +static struct rcu_torture_ops *cur_ops; /* * Definitions for rcu torture testing. @@ -281,14 +282,17 @@ static int rcu_torture_read_lock(void) __acquires(RCU) static void rcu_read_delay(struct rcu_random_state *rrsp) { - long delay; - const long longdelay = 200; + const unsigned long shortdelay_us = 200; + const unsigned long longdelay_ms = 50; - /* We want there to be long-running readers, but not all the time. */ + /* We want a short delay sometimes to make a reader delay the grace + * period, and we want a long delay occasionally to trigger + * force_quiescent_state. */ - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); - if (!delay) - udelay(longdelay); + if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) + udelay(shortdelay_us); } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -320,7 +324,7 @@ rcu_torture_cb(struct rcu_head *p) rp->rtort_mbtest = 0; rcu_torture_free(rp); } else - cur_ops->deferredfree(rp); + cur_ops->deferred_free(rp); } static void rcu_torture_deferred_free(struct rcu_torture *p) @@ -329,18 +333,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) } static struct rcu_torture_ops rcu_ops = { - .init = NULL, - .cleanup = NULL, - .readlock = rcu_torture_read_lock, - .readdelay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferredfree = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .cb_barrier = rcu_barrier, - .stats = NULL, - .irqcapable = 1, - .name = "rcu" + .init = NULL, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferred_free = rcu_torture_deferred_free, + .sync = synchronize_rcu, + .cb_barrier = rcu_barrier, + .stats = NULL, + .irq_capable = 1, + .name = "rcu" }; static void rcu_sync_torture_deferred_free(struct rcu_torture *p) @@ -370,18 +374,18 @@ static void rcu_sync_torture_init(void) } static struct rcu_torture_ops rcu_sync_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_torture_read_lock, - .readdelay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu, - .cb_barrier = NULL, - .stats = NULL, - .irqcapable = 1, - .name = "rcu_sync" + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu, + .cb_barrier = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_sync" }; /* @@ -432,33 +436,33 @@ static void rcu_bh_torture_synchronize(void) } static struct rcu_torture_ops rcu_bh_ops = { - .init = NULL, - .cleanup = NULL, - .readlock = rcu_bh_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferredfree = rcu_bh_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, - .cb_barrier = rcu_barrier_bh, - .stats = NULL, - .irqcapable = 1, - .name = "rcu_bh" + .init = NULL, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_bh_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, + .cb_barrier = rcu_barrier_bh, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh" }; static struct rcu_torture_ops rcu_bh_sync_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_bh_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, - .cb_barrier = NULL, - .stats = NULL, - .irqcapable = 1, - .name = "rcu_bh_sync" + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, + .cb_barrier = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh_sync" }; /* @@ -530,17 +534,17 @@ static int srcu_torture_stats(char *page) } static struct rcu_torture_ops srcu_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, - .readlock = srcu_torture_read_lock, - .readdelay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu" + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu" }; /* @@ -574,32 +578,49 @@ static void sched_torture_synchronize(void) } static struct rcu_torture_ops sched_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = sched_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, - .deferredfree = rcu_sched_torture_deferred_free, - .sync = sched_torture_synchronize, - .cb_barrier = rcu_barrier_sched, - .stats = NULL, - .irqcapable = 1, - .name = "sched" + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferred_free = rcu_sched_torture_deferred_free, + .sync = sched_torture_synchronize, + .cb_barrier = rcu_barrier_sched, + .stats = NULL, + .irq_capable = 1, + .name = "sched" }; static struct rcu_torture_ops sched_ops_sync = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = sched_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = sched_torture_synchronize, - .cb_barrier = NULL, - .stats = NULL, - .name = "sched_sync" + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = sched_torture_synchronize, + .cb_barrier = NULL, + .stats = NULL, + .name = "sched_sync" +}; + +extern int rcu_expedited_torture_stats(char *page); + +static struct rcu_torture_ops sched_expedited_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_sched_expedited, + .cb_barrier = NULL, + .stats = rcu_expedited_torture_stats, + .irq_capable = 1, + .name = "sched_expedited" }; /* @@ -621,7 +642,8 @@ rcu_torture_writer(void *arg) do { schedule_timeout_uninterruptible(1); - if ((rp = rcu_torture_alloc()) == NULL) + rp = rcu_torture_alloc(); + if (rp == NULL) continue; rp->rtort_pipe_count = 0; udelay(rcu_random(&rand) & 0x3ff); @@ -635,7 +657,7 @@ rcu_torture_writer(void *arg) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); old_rp->rtort_pipe_count++; - cur_ops->deferredfree(old_rp); + cur_ops->deferred_free(old_rp); } rcu_torture_current_version++; oldbatch = cur_ops->completed(); @@ -700,7 +722,7 @@ static void rcu_torture_timer(unsigned long unused) if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); spin_lock(&rand_lock); - cur_ops->readdelay(&rand); + cur_ops->read_delay(&rand); n_rcu_torture_timers++; spin_unlock(&rand_lock); preempt_disable(); @@ -738,11 +760,11 @@ rcu_torture_reader(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); - if (irqreader && cur_ops->irqcapable) + if (irqreader && cur_ops->irq_capable) setup_timer_on_stack(&t, rcu_torture_timer, 0); do { - if (irqreader && cur_ops->irqcapable) { + if (irqreader && cur_ops->irq_capable) { if (!timer_pending(&t)) mod_timer(&t, 1); } @@ -757,7 +779,7 @@ rcu_torture_reader(void *arg) } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - cur_ops->readdelay(&rand); + cur_ops->read_delay(&rand); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -778,7 +800,7 @@ rcu_torture_reader(void *arg) } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); rcutorture_shutdown_absorb("rcu_torture_reader"); - if (irqreader && cur_ops->irqcapable) + if (irqreader && cur_ops->irq_capable) del_timer_sync(&t); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); @@ -1078,6 +1100,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, + &sched_expedited_ops, &srcu_ops, &sched_ops, &sched_ops_sync, }; mutex_lock(&fullstop_mutex); @@ -1092,7 +1115,7 @@ rcu_torture_init(void) printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", torture_type); mutex_unlock(&fullstop_mutex); - return (-EINVAL); + return -EINVAL; } if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! */ @@ -1143,7 +1166,7 @@ rcu_torture_init(void) goto unwind; } fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); + GFP_KERNEL); if (fakewriter_tasks == NULL) { VERBOSE_PRINTK_ERRSTRING("out of memory"); firsterr = -ENOMEM; @@ -1152,7 +1175,7 @@ rcu_torture_init(void) for (i = 0; i < nfakewriters; i++) { VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); + "rcu_torture_fakewriter"); if (IS_ERR(fakewriter_tasks[i])) { firsterr = PTR_ERR(fakewriter_tasks[i]); VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7717b95..52b06f6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -25,7 +25,7 @@ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU + * Documentation/RCU */ #include <linux/types.h> #include <linux/kernel.h> @@ -35,6 +35,7 @@ #include <linux/rcupdate.h> #include <linux/interrupt.h> #include <linux/sched.h> +#include <linux/nmi.h> #include <asm/atomic.h> #include <linux/bitops.h> #include <linux/module.h> @@ -46,6 +47,8 @@ #include <linux/mutex.h> #include <linux/time.h> +#include "rcutree.h" + #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = @@ -72,30 +75,55 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); .n_force_qs_ngp = 0, \ } -struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); -DEFINE_PER_CPU(struct rcu_data, rcu_data); +struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); +DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +extern long rcu_batches_completed_sched(void); +static struct rcu_node *rcu_get_root(struct rcu_state *rsp); +static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags); +static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags); +#ifdef CONFIG_HOTPLUG_CPU +static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void __rcu_process_callbacks(struct rcu_state *rsp, + struct rcu_data *rdp); +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_state *rsp); +static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp); +static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, + int preemptable); + +#include "rcutree_plugin.h" + /* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know + * Note a quiescent state. Because we do not need to know * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. + * one since the start of the grace period, this just sets a flag. */ -void rcu_qsctr_inc(int cpu) +void rcu_sched_qs(int cpu) { - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - rdp->passed_quiesc = 1; + struct rcu_data *rdp; + + rdp = &per_cpu(rcu_sched_data, cpu); rdp->passed_quiesc_completed = rdp->completed; + barrier(); + rdp->passed_quiesc = 1; + rcu_preempt_note_context_switch(cpu); } -void rcu_bh_qsctr_inc(int cpu) +void rcu_bh_qs(int cpu) { - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; + struct rcu_data *rdp; + + rdp = &per_cpu(rcu_bh_data, cpu); rdp->passed_quiesc_completed = rdp->completed; + barrier(); + rdp->passed_quiesc = 1; } #ifdef CONFIG_NO_HZ @@ -110,15 +138,16 @@ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ static void force_quiescent_state(struct rcu_state *rsp, int relaxed); +static int rcu_pending(int cpu); /* - * Return the number of RCU batches processed thus far for debug & stats. + * Return the number of RCU-sched batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +long rcu_batches_completed_sched(void) { - return rcu_state.completed; + return rcu_sched_state.completed; } -EXPORT_SYMBOL_GPL(rcu_batches_completed); +EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* * Return the number of RCU BH batches processed thus far for debug & stats. @@ -181,6 +210,10 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) return 1; } + /* If preemptable RCU, no point in sending reschedule IPI. */ + if (rdp->preemptable) + return 0; + /* The CPU is online, so send it a reschedule IPI. */ if (rdp->cpu != smp_processor_id()) smp_send_reschedule(rdp->cpu); @@ -193,7 +226,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ #ifdef CONFIG_NO_HZ -static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5); /** * rcu_enter_nohz - inform RCU that current CPU is entering nohz @@ -213,7 +245,7 @@ void rcu_enter_nohz(void) rdtp = &__get_cpu_var(rcu_dynticks); rdtp->dynticks++; rdtp->dynticks_nesting--; - WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); + WARN_ON_ONCE(rdtp->dynticks & 0x1); local_irq_restore(flags); } @@ -232,7 +264,7 @@ void rcu_exit_nohz(void) rdtp = &__get_cpu_var(rcu_dynticks); rdtp->dynticks++; rdtp->dynticks_nesting++; - WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); + WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); local_irq_restore(flags); smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ } @@ -251,7 +283,7 @@ void rcu_nmi_enter(void) if (rdtp->dynticks & 0x1) return; rdtp->dynticks_nmi++; - WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs); + WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ } @@ -270,7 +302,7 @@ void rcu_nmi_exit(void) return; smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ rdtp->dynticks_nmi++; - WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs); + WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); } /** @@ -286,7 +318,7 @@ void rcu_irq_enter(void) if (rdtp->dynticks_nesting++) return; rdtp->dynticks++; - WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); + WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ } @@ -305,10 +337,10 @@ void rcu_irq_exit(void) return; smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ rdtp->dynticks++; - WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); + WARN_ON_ONCE(rdtp->dynticks & 0x1); /* If the interrupt queued a callback, get out of dyntick mode. */ - if (__get_cpu_var(rcu_data).nxtlist || + if (__get_cpu_var(rcu_sched_data).nxtlist || __get_cpu_var(rcu_bh_data).nxtlist) set_need_resched(); } @@ -461,6 +493,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) printk(KERN_ERR "INFO: RCU detected CPU stalls:"); for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_print_task_stall(rnp); if (rnp_cur->qsmask == 0) continue; for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) @@ -469,6 +502,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp) } printk(" (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); + trigger_all_cpu_backtrace(); + force_quiescent_state(rsp, 0); /* Kick them all. */ } @@ -479,12 +514,14 @@ static void print_cpu_stall(struct rcu_state *rsp) printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", smp_processor_id(), jiffies - rsp->gp_start); - dump_stack(); + trigger_all_cpu_backtrace(); + spin_lock_irqsave(&rnp->lock, flags); if ((long)(jiffies - rsp->jiffies_stall) >= 0) rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; spin_unlock_irqrestore(&rnp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ } @@ -564,8 +601,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) { struct rcu_data *rdp = rsp->rda[smp_processor_id()]; struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_node *rnp_cur; - struct rcu_node *rnp_end; if (!cpu_needs_another_gp(rsp, rdp)) { spin_unlock_irqrestore(&rnp->lock, flags); @@ -574,6 +609,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Advance to a new grace period and initialize state. */ rsp->gpnum++; + WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); @@ -590,7 +626,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Special-case the common single-level case. */ if (NUM_RCU_NODES == 1) { + rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -603,42 +641,28 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) spin_lock(&rsp->onofflock); /* irqs already disabled. */ /* - * Set the quiescent-state-needed bits in all the non-leaf RCU - * nodes for all currently online CPUs. This operation relies - * on the layout of the hierarchy within the rsp->node[] array. - * Note that other CPUs will access only the leaves of the - * hierarchy, which still indicate that no grace period is in - * progress. In addition, we have excluded CPU-hotplug operations. - * - * We therefore do not need to hold any locks. Any required - * memory barriers will be supplied by the locks guarding the - * leaf rcu_nodes in the hierarchy. - */ - - rnp_end = rsp->level[NUM_RCU_LVLS - 1]; - for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++) - rnp_cur->qsmask = rnp_cur->qsmaskinit; - - /* - * Now set up the leaf nodes. Here we must be careful. First, - * we need to hold the lock in order to exclude other CPUs, which - * might be contending for the leaf nodes' locks. Second, as - * soon as we initialize a given leaf node, its CPUs might run - * up the rest of the hierarchy. We must therefore acquire locks - * for each node that we touch during this stage. (But we still - * are excluding CPU-hotplug operations.) + * Set the quiescent-state-needed bits in all the rcu_node + * structures for all currently online CPUs in breadth-first + * order, starting from the root rcu_node structure. This + * operation relies on the layout of the hierarchy within the + * rsp->node[] array. Note that other CPUs will access only + * the leaves of the hierarchy, which still indicate that no + * grace period is in progress, at least until the corresponding + * leaf node has been initialized. In addition, we have excluded + * CPU-hotplug operations. * * Note that the grace period cannot complete until we finish * the initialization process, as there will be at least one * qsmask bit set in the root node until that time, namely the - * one corresponding to this CPU. + * one corresponding to this CPU, due to the fact that we have + * irqs disabled. */ - rnp_end = &rsp->node[NUM_RCU_NODES]; - rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - for (; rnp_cur < rnp_end; rnp_cur++) { - spin_lock(&rnp_cur->lock); /* irqs already disabled. */ - rnp_cur->qsmask = rnp_cur->qsmaskinit; - spin_unlock(&rnp_cur->lock); /* irqs already disabled. */ + for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) { + spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_preempt_check_blocked_tasks(rnp); + rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; + spin_unlock(&rnp->lock); /* irqs already disabled. */ } rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ @@ -674,6 +698,20 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) } /* + * Clean up after the prior grace period and let rcu_start_gp() start up + * the next grace period if one is needed. Note that the caller must + * hold rnp->lock, as required by rcu_start_gp(), which will release it. + */ +static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) + __releases(rnp->lock) +{ + WARN_ON_ONCE(rsp->completed == rsp->gpnum); + rsp->completed = rsp->gpnum; + rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); + rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ +} + +/* * Similar to cpu_quiet(), for which it is a helper function. Allows * a group of CPUs to be quieted at one go, though all the CPUs in the * group must be represented by the same leaf rcu_node structure. @@ -685,6 +723,8 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { + struct rcu_node *rnp_c; + /* Walk up the rcu_node hierarchy. */ for (;;) { if (!(rnp->qsmask & mask)) { @@ -694,7 +734,7 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, return; } rnp->qsmask &= ~mask; - if (rnp->qsmask != 0) { + if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { /* Other bits still set at this level, so done. */ spin_unlock_irqrestore(&rnp->lock, flags); @@ -708,28 +748,26 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, break; } spin_unlock_irqrestore(&rnp->lock, flags); + rnp_c = rnp; rnp = rnp->parent; spin_lock_irqsave(&rnp->lock, flags); + WARN_ON_ONCE(rnp_c->qsmask); } /* * Get here if we are the last CPU to pass through a quiescent - * state for this grace period. Clean up and let rcu_start_gp() - * start up the next grace period if one is needed. Note that - * we still hold rnp->lock, as required by rcu_start_gp(), which - * will release it. + * state for this grace period. Invoke cpu_quiet_msk_finish() + * to clean up and start the next grace period if one is needed. */ - rsp->completed = rsp->gpnum; - rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); - rcu_start_gp(rsp, flags); /* releases rnp->lock. */ + cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ } /* * Record a quiescent state for the specified CPU, which must either be - * the current CPU or an offline CPU. The lastcomp argument is used to - * make sure we are still in the grace period of interest. We don't want - * to end the current grace period based on quiescent states detected in - * an earlier grace period! + * the current CPU. The lastcomp argument is used to make sure we are + * still in the grace period of interest. We don't want to end the current + * grace period based on quiescent states detected in an earlier grace + * period! */ static void cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) @@ -764,7 +802,6 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - rdp = rsp->rda[smp_processor_id()]; rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ @@ -822,30 +859,28 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_lock_irqsave(&rsp->onofflock, flags); /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - rnp = rdp->mynode; + rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { - spin_unlock(&rnp->lock); /* irqs already disabled. */ + spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } + rcu_preempt_offline_tasks(rsp, rnp, rdp); mask = rnp->grpmask; - spin_unlock(&rnp->lock); /* irqs already disabled. */ + spin_unlock(&rnp->lock); /* irqs remain disabled. */ rnp = rnp->parent; } while (rnp != NULL); lastcomp = rsp->completed; spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ - /* Being offline is a quiescent state, so go record it. */ - cpu_quiet(cpu, rsp, rdp, lastcomp); - /* * Move callbacks from the outgoing CPU to the running CPU. * Note that the outgoing CPU is now quiscent, so it is now - * (uncharacteristically) safe to access it rcu_data structure. + * (uncharacteristically) safe to access its rcu_data structure. * Note also that we must carefully retain the order of the * outgoing CPU's callbacks in order for rcu_barrier() to work * correctly. Finally, note that we start all the callbacks @@ -876,8 +911,9 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) */ static void rcu_offline_cpu(int cpu) { - __rcu_offline_cpu(cpu, &rcu_state); + __rcu_offline_cpu(cpu, &rcu_sched_state); __rcu_offline_cpu(cpu, &rcu_bh_state); + rcu_preempt_offline_cpu(cpu); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -963,6 +999,8 @@ static void rcu_do_batch(struct rcu_data *rdp) */ void rcu_check_callbacks(int cpu, int user) { + if (!rcu_pending(cpu)) + return; /* if nothing for RCU to do. */ if (user || (idle_cpu(cpu) && rcu_scheduler_active && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { @@ -971,17 +1009,16 @@ void rcu_check_callbacks(int cpu, int user) * Get here if this CPU took its interrupt from user * mode or from the idle loop, and if this is not a * nested interrupt. In this case, the CPU is in - * a quiescent state, so count it. + * a quiescent state, so note it. * * No memory barrier is required here because both - * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference - * only CPU-local variables that other CPUs neither - * access nor modify, at least not while the corresponding - * CPU is online. + * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local + * variables that other CPUs neither access nor modify, + * at least not while the corresponding CPU is online. */ - rcu_qsctr_inc(cpu); - rcu_bh_qsctr_inc(cpu); + rcu_sched_qs(cpu); + rcu_bh_qs(cpu); } else if (!in_softirq()) { @@ -989,11 +1026,12 @@ void rcu_check_callbacks(int cpu, int user) * Get here if this CPU did not take its interrupt from * softirq, in other words, if it is not interrupting * a rcu_bh read-side critical section. This is an _bh - * critical section, so count it. + * critical section, so note it. */ - rcu_bh_qsctr_inc(cpu); + rcu_bh_qs(cpu); } + rcu_preempt_check_callbacks(cpu); raise_softirq(RCU_SOFTIRQ); } @@ -1132,6 +1170,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; + WARN_ON_ONCE(rdp->beenonline == 0); + /* * If an RCU GP has gone long enough, go check for dyntick * idle CPUs and, if needed, send resched IPIs. @@ -1170,8 +1210,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) */ smp_mb(); /* See above block comment. */ - __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data)); + __rcu_process_callbacks(&rcu_sched_state, + &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + rcu_preempt_process_callbacks(); /* * Memory references from any later RCU read-side critical sections @@ -1227,13 +1269,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), } /* - * Queue an RCU callback for invocation after a grace period. + * Queue an RCU-sched callback for invocation after a grace period. */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_state); + __call_rcu(head, func, &rcu_sched_state); } -EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_sched); /* * Queue an RCU for invocation after a quicker grace period. @@ -1305,10 +1347,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) * by the current CPU, returning 1 if so. This function is part of the * RCU implementation; it is -not- an exported member of the RCU API. */ -int rcu_pending(int cpu) +static int rcu_pending(int cpu) { - return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) || - __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); + return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || + __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || + rcu_preempt_pending(cpu); } /* @@ -1320,27 +1363,46 @@ int rcu_pending(int cpu) int rcu_needs_cpu(int cpu) { /* RCU callbacks either ready or pending? */ - return per_cpu(rcu_data, cpu).nxtlist || - per_cpu(rcu_bh_data, cpu).nxtlist; + return per_cpu(rcu_sched_data, cpu).nxtlist || + per_cpu(rcu_bh_data, cpu).nxtlist || + rcu_preempt_needs_cpu(cpu); } /* - * Initialize a CPU's per-CPU RCU data. We take this "scorched earth" - * approach so that we don't have to worry about how long the CPU has - * been gone, or whether it ever was online previously. We do trust the - * ->mynode field, as it is constant for a given struct rcu_data and - * initialized during early boot. - * - * Note that only one online or offline event can be happening at a given - * time. Note also that we can accept some slop in the rsp->completed - * access due to the fact that this CPU cannot possibly have any RCU - * callbacks in flight yet. + * Do boot-time initialization of a CPU's per-CPU RCU data. */ -static void __cpuinit -rcu_init_percpu_data(int cpu, struct rcu_state *rsp) +static void __init +rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; int i; + struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Set up local state, ensuring consistent view of global state. */ + spin_lock_irqsave(&rnp->lock, flags); + rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen = 0; +#ifdef CONFIG_NO_HZ + rdp->dynticks = &per_cpu(rcu_dynticks, cpu); +#endif /* #ifdef CONFIG_NO_HZ */ + rdp->cpu = cpu; + spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Initialize a CPU's per-CPU RCU data. Note that only one online or + * offline event can be happening at a given time. Note also that we + * can accept some slop in the rsp->completed access due to the fact + * that this CPU cannot possibly have any RCU callbacks in flight yet. + */ +static void __cpuinit +rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) +{ + unsigned long flags; long lastcomp; unsigned long mask; struct rcu_data *rdp = rsp->rda[cpu]; @@ -1354,17 +1416,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->passed_quiesc = 0; /* We could be racing with new GP, */ rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ + rdp->preemptable = preemptable; rdp->passed_quiesc_completed = lastcomp - 1; - rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp->qlen = 0; rdp->blimit = blimit; -#ifdef CONFIG_NO_HZ - rdp->dynticks = &per_cpu(rcu_dynticks, cpu); -#endif /* #ifdef CONFIG_NO_HZ */ - rdp->cpu = cpu; spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* @@ -1387,34 +1441,21 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ - - /* - * A new grace period might start here. If so, we will be part of - * it, and its gpnum will be greater than ours, so we will - * participate. It is also possible for the gpnum to have been - * incremented before this function was called, and the bitmasks - * to not be filled out until now, in which case we will also - * participate due to our gpnum being behind. - */ - - /* Since it is coming online, the CPU is in a quiescent state. */ - cpu_quiet(cpu, rsp, rdp, lastcomp); - local_irq_restore(flags); + spin_unlock_irqrestore(&rsp->onofflock, flags); } static void __cpuinit rcu_online_cpu(int cpu) { - rcu_init_percpu_data(cpu, &rcu_state); - rcu_init_percpu_data(cpu, &rcu_bh_state); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + rcu_init_percpu_data(cpu, &rcu_sched_state, 0); + rcu_init_percpu_data(cpu, &rcu_bh_state, 0); + rcu_preempt_init_percpu_data(cpu); } /* - * Handle CPU online/offline notifcation events. + * Handle CPU online/offline notification events. */ -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +int __cpuinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1486,6 +1527,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { spin_lock_init(&rnp->lock); + rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; @@ -1503,16 +1545,20 @@ static void __init rcu_init_one(struct rcu_state *rsp) j / rsp->levelspread[i - 1]; } rnp->level = i; + INIT_LIST_HEAD(&rnp->blocked_tasks[0]); + INIT_LIST_HEAD(&rnp->blocked_tasks[1]); } } } /* - * Helper macro for __rcu_init(). To be used nowhere else! - * Assigns leaf node pointers into each CPU's rcu_data structure. + * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used + * nowhere else! Assigns leaf node pointers into each CPU's rcu_data + * structure. */ -#define RCU_DATA_PTR_INIT(rsp, rcu_data) \ +#define RCU_INIT_FLAVOR(rsp, rcu_data) \ do { \ + rcu_init_one(rsp); \ rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ j = 0; \ for_each_possible_cpu(i) { \ @@ -1520,32 +1566,43 @@ do { \ j++; \ per_cpu(rcu_data, i).mynode = &rnp[j]; \ (rsp)->rda[i] = &per_cpu(rcu_data, i); \ + rcu_boot_init_percpu_data(i, rsp); \ } \ } while (0) -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; +#ifdef CONFIG_TREE_PREEMPT_RCU + +void __init __rcu_init_preempt(void) +{ + int i; /* All used by RCU_INIT_FLAVOR(). */ + int j; + struct rcu_node *rnp; + + RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); +} + +#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +void __init __rcu_init_preempt(void) +{ +} + +#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ void __init __rcu_init(void) { - int i; /* All used by RCU_DATA_PTR_INIT(). */ + int i; /* All used by RCU_INIT_FLAVOR(). */ int j; struct rcu_node *rnp; - printk(KERN_INFO "Hierarchical RCU implementation.\n"); + rcu_bootup_announce(); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - rcu_init_one(&rcu_state); - RCU_DATA_PTR_INIT(&rcu_state, rcu_data); - rcu_init_one(&rcu_bh_state); - RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); - - for_each_online_cpu(i) - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); - /* Register notifier for non-boot CPUs */ - register_cpu_notifier(&rcu_nb); + RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); + RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); + __rcu_init_preempt(); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } module_param(blimit, int, 0); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 5e872bb..8e8287a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -1,10 +1,259 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Author: Ingo Molnar <mingo@elte.hu> + * Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +#include <linux/cache.h> +#include <linux/spinlock.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/seqlock.h> + +/* + * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. + * In theory, it should be possible to add more levels straightforwardly. + * In practice, this has not been tested, so there is probably some + * bug somewhere. + */ +#define MAX_RCU_LVLS 3 +#define RCU_FANOUT (CONFIG_RCU_FANOUT) +#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) +#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT +# define NUM_RCU_LVLS 1 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (NR_CPUS) +# define NUM_RCU_LVL_2 0 +# define NUM_RCU_LVL_3 0 +#elif NR_CPUS <= RCU_FANOUT_SQ +# define NUM_RCU_LVLS 2 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) +# define NUM_RCU_LVL_2 (NR_CPUS) +# define NUM_RCU_LVL_3 0 +#elif NR_CPUS <= RCU_FANOUT_CUBE +# define NUM_RCU_LVLS 3 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) +# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) +# define NUM_RCU_LVL_3 NR_CPUS +#else +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +#endif /* #if (NR_CPUS) <= RCU_FANOUT */ + +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) + +/* + * Dynticks per-CPU state. + */ +struct rcu_dynticks { + int dynticks_nesting; /* Track nesting level, sort of. */ + int dynticks; /* Even value for dynticks-idle, else odd. */ + int dynticks_nmi; /* Even value for either dynticks-idle or */ + /* not in nmi handler, else odd. So this */ + /* remains even for nmi from irq handler. */ +}; + +/* + * Definition for node within the RCU grace-period-detection hierarchy. + */ +struct rcu_node { + spinlock_t lock; + long gpnum; /* Current grace period for this node. */ + /* This will either be equal to or one */ + /* behind the root rcu_node's gpnum. */ + unsigned long qsmask; /* CPUs or groups that need to switch in */ + /* order for current grace period to proceed.*/ + unsigned long qsmaskinit; + /* Per-GP initialization for qsmask. */ + unsigned long grpmask; /* Mask to apply to parent qsmask. */ + int grplo; /* lowest-numbered CPU or group here. */ + int grphi; /* highest-numbered CPU or group here. */ + u8 grpnum; /* CPU/group number for next level up. */ + u8 level; /* root is at level 0. */ + struct rcu_node *parent; + struct list_head blocked_tasks[2]; + /* Tasks blocked in RCU read-side critsect. */ +} ____cacheline_internodealigned_in_smp; + +/* Index values for nxttail array in struct rcu_data. */ +#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ +#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ +#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ +#define RCU_NEXT_TAIL 3 +#define RCU_NEXT_SIZE 4 + +/* Per-CPU data for read-copy update. */ +struct rcu_data { + /* 1) quiescent-state and grace-period handling : */ + long completed; /* Track rsp->completed gp number */ + /* in order to detect GP end. */ + long gpnum; /* Highest gp number that this CPU */ + /* is aware of having started. */ + long passed_quiesc_completed; + /* Value of completed at time of qs. */ + bool passed_quiesc; /* User-mode/idle loop etc. */ + bool qs_pending; /* Core waits for quiesc state. */ + bool beenonline; /* CPU online at least once. */ + bool preemptable; /* Preemptable RCU? */ + struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ + unsigned long grpmask; /* Mask to apply to leaf qsmask. */ + + /* 2) batch handling */ + /* + * If nxtlist is not NULL, it is partitioned as follows. + * Any of the partitions might be empty, in which case the + * pointer to that partition will be equal to the pointer for + * the following partition. When the list is empty, all of + * the nxttail elements point to nxtlist, which is NULL. + * + * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]): + * Entries that might have arrived after current GP ended + * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): + * Entries known to have arrived before current GP ended + * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): + * Entries that batch # <= ->completed - 1: waiting for current GP + * [nxtlist, *nxttail[RCU_DONE_TAIL]): + * Entries that batch # <= ->completed + * The grace period for these entries has completed, and + * the other grace-period-completed entries may be moved + * here temporarily in rcu_process_callbacks(). + */ + struct rcu_head *nxtlist; + struct rcu_head **nxttail[RCU_NEXT_SIZE]; + long qlen; /* # of queued callbacks */ + long blimit; /* Upper limit on a processed batch */ + +#ifdef CONFIG_NO_HZ + /* 3) dynticks interface. */ + struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ + int dynticks_snap; /* Per-GP tracking for dynticks. */ + int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */ +#endif /* #ifdef CONFIG_NO_HZ */ + + /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ +#ifdef CONFIG_NO_HZ + unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ +#endif /* #ifdef CONFIG_NO_HZ */ + unsigned long offline_fqs; /* Kicked due to being offline. */ + unsigned long resched_ipi; /* Sent a resched IPI. */ + + /* 5) __rcu_pending() statistics. */ + long n_rcu_pending; /* rcu_pending() calls since boot. */ + long n_rp_qs_pending; + long n_rp_cb_ready; + long n_rp_cpu_needs_gp; + long n_rp_gp_completed; + long n_rp_gp_started; + long n_rp_need_fqs; + long n_rp_need_nothing; + + int cpu; +}; + +/* Values for signaled field in struct rcu_state. */ +#define RCU_GP_INIT 0 /* Grace period being initialized. */ +#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ +#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ +#ifdef CONFIG_NO_HZ +#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK +#else /* #ifdef CONFIG_NO_HZ */ +#define RCU_SIGNAL_INIT RCU_FORCE_QS +#endif /* #else #ifdef CONFIG_NO_HZ */ + +#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ + /* to take at least one */ + /* scheduling clock irq */ + /* before ratting on them. */ + +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +/* + * RCU global state, including node hierarchy. This hierarchy is + * represented in "heap" form in a dense array. The root (first level) + * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second + * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]), + * and the third level in ->node[m+1] and following (->node[m+1] referenced + * by ->level[2]). The number of levels is determined by the number of + * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy" + * consisting of a single rcu_node. + */ +struct rcu_state { + struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ + struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ + u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ + u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ + struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ + + /* The following fields are guarded by the root rcu_node's lock. */ + + u8 signaled ____cacheline_internodealigned_in_smp; + /* Force QS state. */ + long gpnum; /* Current gp number. */ + long completed; /* # of last completed gp. */ + spinlock_t onofflock; /* exclude on/offline and */ + /* starting new GP. */ + spinlock_t fqslock; /* Only one task forcing */ + /* quiescent states. */ + unsigned long jiffies_force_qs; /* Time at which to invoke */ + /* force_quiescent_state(). */ + unsigned long n_force_qs; /* Number of calls to */ + /* force_quiescent_state(). */ + unsigned long n_force_qs_lh; /* ~Number of calls leaving */ + /* due to lock unavailable. */ + unsigned long n_force_qs_ngp; /* Number of calls leaving */ + /* due to no GP active. */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + unsigned long gp_start; /* Time at which GP started, */ + /* but in jiffies. */ + unsigned long jiffies_stall; /* Time at which to check */ + /* for CPU stalls. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#ifdef CONFIG_NO_HZ + long dynticks_completed; /* Value of completed @ snap. */ +#endif /* #ifdef CONFIG_NO_HZ */ +}; + +#ifdef RCU_TREE_NONCORE /* * RCU implementation internal declarations: */ -extern struct rcu_state rcu_state; -DECLARE_PER_CPU(struct rcu_data, rcu_data); +extern struct rcu_state rcu_sched_state; +DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); extern struct rcu_state rcu_bh_state; DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); +#ifdef CONFIG_TREE_PREEMPT_RCU +extern struct rcu_state rcu_preempt_state; +DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +#endif /* #ifdef RCU_TREE_NONCORE */ + diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h new file mode 100644 index 0000000..1cee04f --- /dev/null +++ b/kernel/rcutree_plugin.h @@ -0,0 +1,566 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions that provide either classic + * or preemptable semantics. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright Red Hat, 2009 + * Copyright IBM Corporation, 2009 + * + * Author: Ingo Molnar <mingo@elte.hu> + * Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + + +#ifdef CONFIG_TREE_PREEMPT_RCU + +struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); +DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); + +/* + * Tell them what RCU they are running. + */ +static inline void rcu_bootup_announce(void) +{ + printk(KERN_INFO + "Experimental preemptable hierarchical RCU implementation.\n"); +} + +/* + * Return the number of RCU-preempt batches processed thus far + * for debug and statistics. + */ +long rcu_batches_completed_preempt(void) +{ + return rcu_preempt_state.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); + +/* + * Return the number of RCU batches processed thus far for debug & stats. + */ +long rcu_batches_completed(void) +{ + return rcu_batches_completed_preempt(); +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Record a preemptable-RCU quiescent state for the specified CPU. Note + * that this just means that the task currently running on the CPU is + * not in a quiescent state. There might be any number of tasks blocked + * while in an RCU read-side critical section. + */ +static void rcu_preempt_qs(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); + rdp->passed_quiesc_completed = rdp->completed; + barrier(); + rdp->passed_quiesc = 1; +} + +/* + * We have entered the scheduler, and the current task might soon be + * context-switched away from. If this task is in an RCU read-side + * critical section, we will no longer be able to rely on the CPU to + * record that fact, so we enqueue the task on the appropriate entry + * of the blocked_tasks[] array. The task will dequeue itself when + * it exits the outermost enclosing RCU read-side critical section. + * Therefore, the current grace period cannot be permitted to complete + * until the blocked_tasks[] entry indexed by the low-order bit of + * rnp->gpnum empties. + * + * Caller must disable preemption. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ + struct task_struct *t = current; + unsigned long flags; + int phase; + struct rcu_data *rdp; + struct rcu_node *rnp; + + if (t->rcu_read_lock_nesting && + (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + + /* Possibly blocking in an RCU read-side critical section. */ + rdp = rcu_preempt_state.rda[cpu]; + rnp = rdp->mynode; + spin_lock_irqsave(&rnp->lock, flags); + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + t->rcu_blocked_node = rnp; + + /* + * If this CPU has already checked in, then this task + * will hold up the next grace period rather than the + * current grace period. Queue the task accordingly. + * If the task is queued for the current grace period + * (i.e., this CPU has not yet passed through a quiescent + * state for the current grace period), then as long + * as that task remains queued, the current grace period + * cannot end. + * + * But first, note that the current CPU must still be + * on line! + */ + WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); + WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); + phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; + list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); + spin_unlock_irqrestore(&rnp->lock, flags); + } + + /* + * Either we were not in an RCU read-side critical section to + * begin with, or we have now recorded that critical section + * globally. Either way, we can now note a quiescent state + * for this CPU. Again, if we were in an RCU read-side critical + * section, and if that critical section was blocking the current + * grace period, then the fact that the task has been enqueued + * means that we continue to block the current grace period. + */ + rcu_preempt_qs(cpu); + local_irq_save(flags); + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + local_irq_restore(flags); +} + +/* + * Tree-preemptable RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + ACCESS_ONCE(current->rcu_read_lock_nesting)++; + barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +static void rcu_read_unlock_special(struct task_struct *t) +{ + int empty; + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp; + int special; + + /* NMI handlers cannot block and cannot safely manipulate state. */ + if (in_nmi()) + return; + + local_irq_save(flags); + + /* + * If RCU core is waiting for this CPU to exit critical section, + * let it know that we have done so. + */ + special = t->rcu_read_unlock_special; + if (special & RCU_READ_UNLOCK_NEED_QS) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + rcu_preempt_qs(smp_processor_id()); + } + + /* Hardware IRQ handlers cannot block. */ + if (in_irq()) { + local_irq_restore(flags); + return; + } + + /* Clean up if blocked during RCU read-side critical section. */ + if (special & RCU_READ_UNLOCK_BLOCKED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + + /* + * Remove this task from the list it blocked on. The + * task can migrate while we acquire the lock, but at + * most one time. So at most two passes through loop. + */ + for (;;) { + rnp = t->rcu_blocked_node; + spin_lock(&rnp->lock); /* irqs already disabled. */ + if (rnp == t->rcu_blocked_node) + break; + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); + list_del_init(&t->rcu_node_entry); + t->rcu_blocked_node = NULL; + + /* + * If this was the last task on the current list, and if + * we aren't waiting on any CPUs, report the quiescent state. + * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() + * drop rnp->lock and restore irq. + */ + if (!empty && rnp->qsmask == 0 && + list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) { + struct rcu_node *rnp_p; + + if (rnp->parent == NULL) { + /* Only one rcu_node in the tree. */ + cpu_quiet_msk_finish(&rcu_preempt_state, flags); + return; + } + /* Report up the rest of the hierarchy. */ + mask = rnp->grpmask; + spin_unlock_irqrestore(&rnp->lock, flags); + rnp_p = rnp->parent; + spin_lock_irqsave(&rnp_p->lock, flags); + WARN_ON_ONCE(rnp->qsmask); + cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); + return; + } + spin_unlock(&rnp->lock); + } + local_irq_restore(flags); +} + +/* + * Tree-preemptable RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ + if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && + unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each. + */ +static void rcu_print_task_stall(struct rcu_node *rnp) +{ + unsigned long flags; + struct list_head *lp; + int phase = rnp->gpnum & 0x1; + struct task_struct *t; + + if (!list_empty(&rnp->blocked_tasks[phase])) { + spin_lock_irqsave(&rnp->lock, flags); + phase = rnp->gpnum & 0x1; /* re-read under lock. */ + lp = &rnp->blocked_tasks[phase]; + list_for_each_entry(t, lp, rcu_node_entry) + printk(" P%d", t->pid); + spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +/* + * Check that the list of blocked tasks for the newly completed grace + * period is in fact empty. It is a serious bug to complete a grace + * period that still has RCU readers blocked! This function must be + * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock + * must be held by the caller. + */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +{ + WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])); + WARN_ON_ONCE(rnp->qsmask); +} + +/* + * Check for preempted RCU readers for the specified rcu_node structure. + * If the caller needs a reliable answer, it must hold the rcu_node's + * >lock. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Handle tasklist migration for case in which all CPUs covered by the + * specified rcu_node have gone offline. Move them up to the root + * rcu_node. The reason for not just moving them to the immediate + * parent is to remove the need for rcu_read_unlock_special() to + * make more than two attempts to acquire the target rcu_node's lock. + * + * The caller must hold rnp->lock with irqs disabled. + */ +static void rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) +{ + int i; + struct list_head *lp; + struct list_head *lp_root; + struct rcu_node *rnp_root = rcu_get_root(rsp); + struct task_struct *tp; + + if (rnp == rnp_root) { + WARN_ONCE(1, "Last CPU thought to be offlined?"); + return; /* Shouldn't happen: at least one CPU online. */ + } + WARN_ON_ONCE(rnp != rdp->mynode && + (!list_empty(&rnp->blocked_tasks[0]) || + !list_empty(&rnp->blocked_tasks[1]))); + + /* + * Move tasks up to root rcu_node. Rely on the fact that the + * root rcu_node can be at most one ahead of the rest of the + * rcu_nodes in terms of gp_num value. This fact allows us to + * move the blocked_tasks[] array directly, element by element. + */ + for (i = 0; i < 2; i++) { + lp = &rnp->blocked_tasks[i]; + lp_root = &rnp_root->blocked_tasks[i]; + while (!list_empty(lp)) { + tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); + spin_lock(&rnp_root->lock); /* irqs already disabled */ + list_del(&tp->rcu_node_entry); + tp->rcu_blocked_node = rnp_root; + list_add(&tp->rcu_node_entry, lp_root); + spin_unlock(&rnp_root->lock); /* irqs remain disabled */ + } + } +} + +/* + * Do CPU-offline processing for preemptable RCU. + */ +static void rcu_preempt_offline_cpu(int cpu) +{ + __rcu_offline_cpu(cpu, &rcu_preempt_state); +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Check for a quiescent state from the current CPU. When a task blocks, + * the task is recorded in the corresponding CPU's rcu_node structure, + * which is checked elsewhere. + * + * Caller must disable hard irqs. + */ +static void rcu_preempt_check_callbacks(int cpu) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + rcu_preempt_qs(cpu); + return; + } + if (per_cpu(rcu_preempt_data, cpu).qs_pending) + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; +} + +/* + * Process callbacks for preemptable RCU. + */ +static void rcu_preempt_process_callbacks(void) +{ + __rcu_process_callbacks(&rcu_preempt_state, + &__get_cpu_var(rcu_preempt_data)); +} + +/* + * Queue a preemptable-RCU callback for invocation after a grace period. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_preempt_state); +} +EXPORT_SYMBOL_GPL(call_rcu); + +/* + * Check to see if there is any immediate preemptable-RCU-related work + * to be done. + */ +static int rcu_preempt_pending(int cpu) +{ + return __rcu_pending(&rcu_preempt_state, + &per_cpu(rcu_preempt_data, cpu)); +} + +/* + * Does preemptable RCU need the CPU to stay out of dynticks mode? + */ +static int rcu_preempt_needs_cpu(int cpu) +{ + return !!per_cpu(rcu_preempt_data, cpu).nxtlist; +} + +/* + * Initialize preemptable RCU's per-CPU data. + */ +static void __cpuinit rcu_preempt_init_percpu_data(int cpu) +{ + rcu_init_percpu_data(cpu, &rcu_preempt_state, 1); +} + +/* + * Check for a task exiting while in a preemptable-RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) + return; + t->rcu_read_lock_nesting = 1; + rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +/* + * Tell them what RCU they are running. + */ +static inline void rcu_bootup_announce(void) +{ + printk(KERN_INFO "Hierarchical RCU implementation.\n"); +} + +/* + * Return the number of RCU batches processed thus far for debug & stats. + */ +long rcu_batches_completed(void) +{ + return rcu_batches_completed_sched(); +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Because preemptable RCU does not exist, we never have to check for + * CPUs being in quiescent states. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ +} + +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +/* + * Because preemptable RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static void rcu_print_task_stall(struct rcu_node *rnp) +{ +} + +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +/* + * Because there is no preemptable RCU, there can be no readers blocked, + * so there is no need to check for blocked tasks. So check only for + * bogus qsmask values. + */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +{ + WARN_ON_ONCE(rnp->qsmask); +} + +/* + * Because preemptable RCU does not exist, there are never any preempted + * RCU readers. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) +{ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Because preemptable RCU does not exist, it never needs to migrate + * tasks that were blocked within RCU read-side critical sections. + */ +static void rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) +{ +} + +/* + * Because preemptable RCU does not exist, it never needs CPU-offline + * processing. + */ +static void rcu_preempt_offline_cpu(int cpu) +{ +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Because preemptable RCU does not exist, it never has any callbacks + * to check. + */ +void rcu_preempt_check_callbacks(int cpu) +{ +} + +/* + * Because preemptable RCU does not exist, it never has any callbacks + * to process. + */ +void rcu_preempt_process_callbacks(void) +{ +} + +/* + * In classic RCU, call_rcu() is just call_rcu_sched(). + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + call_rcu_sched(head, func); +} +EXPORT_SYMBOL_GPL(call_rcu); + +/* + * Because preemptable RCU does not exist, it never has any work to do. + */ +static int rcu_preempt_pending(int cpu) +{ + return 0; +} + +/* + * Because preemptable RCU does not exist, it never needs any CPU. + */ +static int rcu_preempt_needs_cpu(int cpu) +{ + return 0; +} + +/* + * Because preemptable RCU does not exist, there is no per-CPU + * data to initialize. + */ +static void __cpuinit rcu_preempt_init_percpu_data(int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index fe1dcdb..c89f5e9 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -20,7 +20,7 @@ * Papers: http://www.rdrop.com/users/paulmck/RCU * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU + * Documentation/RCU * */ #include <linux/types.h> @@ -43,6 +43,7 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> +#define RCU_TREE_NONCORE #include "rcutree.h" static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) @@ -76,8 +77,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata(struct seq_file *m, void *unused) { - seq_puts(m, "rcu:\n"); - PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m); +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "rcu_preempt:\n"); + PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + seq_puts(m, "rcu_sched:\n"); + PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); seq_puts(m, "rcu_bh:\n"); PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); return 0; @@ -102,7 +107,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) return; seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", rdp->cpu, - cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", + cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", rdp->completed, rdp->gpnum, rdp->passed_quiesc, rdp->passed_quiesc_completed, rdp->qs_pending); @@ -124,8 +129,12 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); - seq_puts(m, "\"rcu:\"\n"); - PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m); +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "\"rcu_preempt:\"\n"); + PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + seq_puts(m, "\"rcu_sched:\"\n"); + PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m); seq_puts(m, "\"rcu_bh:\"\n"); PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); return 0; @@ -171,8 +180,12 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) static int show_rcuhier(struct seq_file *m, void *unused) { - seq_puts(m, "rcu:\n"); - print_one_rcu_state(m, &rcu_state); +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "rcu_preempt:\n"); + print_one_rcu_state(m, &rcu_preempt_state); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + seq_puts(m, "rcu_sched:\n"); + print_one_rcu_state(m, &rcu_sched_state); seq_puts(m, "rcu_bh:\n"); print_one_rcu_state(m, &rcu_bh_state); return 0; @@ -193,8 +206,12 @@ static struct file_operations rcuhier_fops = { static int show_rcugp(struct seq_file *m, void *unused) { - seq_printf(m, "rcu: completed=%ld gpnum=%ld\n", - rcu_state.completed, rcu_state.gpnum); +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", + rcu_preempt_state.completed, rcu_preempt_state.gpnum); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", + rcu_sched_state.completed, rcu_sched_state.gpnum); seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", rcu_bh_state.completed, rcu_bh_state.gpnum); return 0; @@ -243,8 +260,12 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) static int show_rcu_pending(struct seq_file *m, void *unused) { - seq_puts(m, "rcu:\n"); - print_rcu_pendings(m, &rcu_state); +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "rcu_preempt:\n"); + print_rcu_pendings(m, &rcu_preempt_state); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + seq_puts(m, "rcu_sched:\n"); + print_rcu_pendings(m, &rcu_sched_state); seq_puts(m, "rcu_bh:\n"); print_rcu_pendings(m, &rcu_bh_state); return 0; @@ -264,62 +285,47 @@ static struct file_operations rcu_pending_fops = { }; static struct dentry *rcudir; -static struct dentry *datadir; -static struct dentry *datadir_csv; -static struct dentry *gpdir; -static struct dentry *hierdir; -static struct dentry *rcu_pendingdir; static int __init rcuclassic_trace_init(void) { + struct dentry *retval; + rcudir = debugfs_create_dir("rcu", NULL); if (!rcudir) - goto out; + goto free_out; - datadir = debugfs_create_file("rcudata", 0444, rcudir, + retval = debugfs_create_file("rcudata", 0444, rcudir, NULL, &rcudata_fops); - if (!datadir) + if (!retval) goto free_out; - datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir, + retval = debugfs_create_file("rcudata.csv", 0444, rcudir, NULL, &rcudata_csv_fops); - if (!datadir_csv) + if (!retval) goto free_out; - gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); - if (!gpdir) + retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); + if (!retval) goto free_out; - hierdir = debugfs_create_file("rcuhier", 0444, rcudir, + retval = debugfs_create_file("rcuhier", 0444, rcudir, NULL, &rcuhier_fops); - if (!hierdir) + if (!retval) goto free_out; - rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, + retval = debugfs_create_file("rcu_pending", 0444, rcudir, NULL, &rcu_pending_fops); - if (!rcu_pendingdir) + if (!retval) goto free_out; return 0; free_out: - if (datadir) - debugfs_remove(datadir); - if (datadir_csv) - debugfs_remove(datadir_csv); - if (gpdir) - debugfs_remove(gpdir); - debugfs_remove(rcudir); -out: + debugfs_remove_recursive(rcudir); return 1; } static void __exit rcuclassic_trace_cleanup(void) { - debugfs_remove(datadir); - debugfs_remove(datadir_csv); - debugfs_remove(gpdir); - debugfs_remove(hierdir); - debugfs_remove(rcu_pendingdir); - debugfs_remove(rcudir); + debugfs_remove_recursive(rcudir); } diff --git a/kernel/resource.c b/kernel/resource.c index 78b0872..fb11a58 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -223,13 +223,13 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* * Finds the lowest memory reosurce exists within [res->start.res->end) - * the caller must specify res->start, res->end, res->flags. + * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. */ -static int find_next_system_ram(struct resource *res) +static int find_next_system_ram(struct resource *res, char *name) { resource_size_t start, end; struct resource *p; @@ -245,6 +245,8 @@ static int find_next_system_ram(struct resource *res) /* system ram is just marked as IORESOURCE_MEM */ if (p->flags != res->flags) continue; + if (name && strcmp(p->name, name)) + continue; if (p->start > end) { p = NULL; break; @@ -262,19 +264,26 @@ static int find_next_system_ram(struct resource *res) res->end = p->end; return 0; } -int -walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, - int (*func)(unsigned long, unsigned long, void *)) + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". + */ +int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, + void *arg, int (*func)(unsigned long, unsigned long, void *)) { struct resource res; unsigned long pfn, len; u64 orig_end; int ret = -1; + res.start = (u64) start_pfn << PAGE_SHIFT; res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; - while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { + while ((res.start < res.end) && + (find_next_system_ram(&res, "System RAM") >= 0)) { pfn = (unsigned long)(res.start >> PAGE_SHIFT); len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); ret = (*func)(pfn, len, arg); diff --git a/kernel/sched.c b/kernel/sched.c index 1b59e26..2f76e06b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,7 +39,7 @@ #include <linux/completion.h> #include <linux/kernel_stat.h> #include <linux/debug_locks.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/profile.h> @@ -64,7 +64,6 @@ #include <linux/tsacct_kern.h> #include <linux/kprobes.h> #include <linux/delayacct.h> -#include <linux/reciprocal_div.h> #include <linux/unistd.h> #include <linux/pagemap.h> #include <linux/hrtimer.h> @@ -120,30 +119,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -#ifdef CONFIG_SMP - -static void double_rq_lock(struct rq *rq1, struct rq *rq2); - -/* - * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) - * Since cpu_power is a 'constant', we can use a reciprocal divide. - */ -static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) -{ - return reciprocal_divide(load, sg->reciprocal_cpu_power); -} - -/* - * Each time a sched group cpu_power is changed, - * we must compute its reciprocal value - */ -static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) -{ - sg->__cpu_power += val; - sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); -} -#endif - static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) @@ -309,8 +284,8 @@ void set_tg_uid(struct user_struct *user) /* * Root task group. - * Every UID task group (including init_task_group aka UID-0) will - * be a child to this group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. */ struct task_group root_task_group; @@ -318,12 +293,12 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); #endif /* CONFIG_RT_GROUP_SCHED */ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group @@ -401,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #else -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ - return 1; -} -#endif - static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline struct task_group *task_group(struct task_struct *p) { @@ -537,14 +505,6 @@ struct root_domain { #ifdef CONFIG_SMP struct cpupri cpupri; #endif -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Preferred wake up cpu nominated by sched_mc balance that will be - * used when most cpus are idle in the system indicating overall very - * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) - */ - unsigned int sched_mc_preferred_wakeup_cpu; -#endif }; /* @@ -616,6 +576,7 @@ struct rq { unsigned char idle_at_tick; /* For active balancing */ + int post_schedule; int active_balance; int push_cpu; /* cpu of this runqueue: */ @@ -626,6 +587,9 @@ struct rq { struct task_struct *migration_thread; struct list_head migration_queue; + + u64 rt_avg; + u64 age_stamp; #endif /* calc_load related fields */ @@ -665,9 +629,10 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) +static inline +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - rq->curr->sched_class->check_preempt_curr(rq, p, sync); + rq->curr->sched_class->check_preempt_curr(rq, p, flags); } static inline int cpu_of(struct rq *rq) @@ -693,6 +658,7 @@ static inline int cpu_of(struct rq *rq) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() (&__raw_get_cpu_var(runqueues)) inline void update_rq_clock(struct rq *rq) { @@ -715,15 +681,9 @@ inline void update_rq_clock(struct rq *rq) * This interface allows printk to be called with the runqueue lock * held and know whether or not it is OK to wake up the klogd. */ -int runqueue_is_locked(void) +int runqueue_is_locked(int cpu) { - int cpu = get_cpu(); - struct rq *rq = cpu_rq(cpu); - int ret; - - ret = spin_is_locked(&rq->lock); - put_cpu(); - return ret; + return spin_is_locked(&cpu_rq(cpu)->lock); } /* @@ -861,6 +821,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; unsigned int sysctl_sched_shares_thresh = 4; /* + * period over which we average the RT time consumption, measured + * in ms. + * + * default: 1s + */ +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; + +/* * period over which we measure -rt task cpu usage in us. * default: 1s */ @@ -1278,12 +1246,37 @@ void wake_up_idle_cpu(int cpu) } #endif /* CONFIG_NO_HZ */ +static u64 sched_avg_period(void) +{ + return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +static void sched_avg_update(struct rq *rq) +{ + s64 period = sched_avg_period(); + + while ((s64)(rq->clock - rq->age_stamp) > period) { + rq->age_stamp += period; + rq->rt_avg /= 2; + } +} + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ + rq->rt_avg += rt_delta; + sched_avg_update(rq); +} + #else /* !CONFIG_SMP */ static void resched_task(struct task_struct *p) { assert_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ +} #endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 @@ -1494,8 +1487,65 @@ static int tg_nop(struct task_group *tg, void *data) #endif #ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static unsigned long source_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return max(rq->cpu_load[type-1], total); +} + +static struct sched_group *group_of(int cpu) +{ + struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); + + if (!sd) + return NULL; + + return sd->groups; +} + +static unsigned long power_of(int cpu) +{ + struct sched_group *group = group_of(cpu); + + if (!group) + return SCHED_LOAD_SCALE; + + return group->cpu_power; +} + static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) @@ -1513,28 +1563,35 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED +struct update_shares_data { + unsigned long rq_weight[NR_CPUS]; +}; + +static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); + static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* * Calculate and set the cpu's group shares. */ -static void -update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight) +static void update_group_shares_cpu(struct task_group *tg, int cpu, + unsigned long sd_shares, + unsigned long sd_rq_weight, + struct update_shares_data *usd) { - unsigned long shares; - unsigned long rq_weight; - - if (!tg->se[cpu]) - return; + unsigned long shares, rq_weight; + int boost = 0; - rq_weight = tg->cfs_rq[cpu]->rq_weight; + rq_weight = usd->rq_weight[cpu]; + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } /* - * \Sum shares * rq_weight - * shares = ----------------------- - * \Sum rq_weight - * + * \Sum_j shares_j * rq_weight_i + * shares_i = ----------------------------- + * \Sum_j rq_weight_j */ shares = (sd_shares * rq_weight) / sd_rq_weight; shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); @@ -1545,8 +1602,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->shares = shares; - + tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); } @@ -1559,22 +1616,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0; - unsigned long shares = 0; + unsigned long weight, rq_weight = 0, shares = 0; + struct update_shares_data *usd; struct sched_domain *sd = data; + unsigned long flags; int i; + if (!tg->se[0]) + return 0; + + local_irq_save(flags); + usd = &__get_cpu_var(update_shares_data); + for_each_cpu(i, sched_domain_span(sd)) { + weight = tg->cfs_rq[i]->load.weight; + usd->rq_weight[i] = weight; + /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to * run here it will not get delayed by group starvation. */ - weight = tg->cfs_rq[i]->load.weight; if (!weight) weight = NICE_0_LOAD; - tg->cfs_rq[i]->rq_weight = weight; rq_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1586,7 +1651,9 @@ static int tg_shares_up(struct task_group *tg, void *data) shares = tg->shares; for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight); + update_group_shares_cpu(tg, i, shares, rq_weight, usd); + + local_irq_restore(flags); return 0; } @@ -1616,8 +1683,14 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_shares(struct sched_domain *sd) { - u64 now = cpu_clock(raw_smp_processor_id()); - s64 elapsed = now - sd->last_update; + s64 elapsed; + u64 now; + + if (root_task_group_empty()) + return; + + now = cpu_clock(raw_smp_processor_id()); + elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; @@ -1627,6 +1700,9 @@ static void update_shares(struct sched_domain *sd) static void update_shares_locked(struct rq *rq, struct sched_domain *sd) { + if (root_task_group_empty()) + return; + spin_unlock(&rq->lock); update_shares(sd); spin_lock(&rq->lock); @@ -1634,6 +1710,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) static void update_h_load(long cpu) { + if (root_task_group_empty()) + return; + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } @@ -1651,6 +1730,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #ifdef CONFIG_PREEMPT +static void double_rq_lock(struct rq *rq1, struct rq *rq2); + /* * fair double_lock_balance: Safely acquires both rq->locks in a fair * way at the expense of forcing extra atomic operations in all @@ -1915,13 +1996,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SMP - -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - /* * Is this task likely cache-hot: */ @@ -1979,7 +2053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif - perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } p->se.vruntime -= old_cfsrq->min_vruntime - @@ -2195,186 +2269,6 @@ void kick_process(struct task_struct *p) preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); - -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - */ -static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) -{ - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; - int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; - - do { - unsigned long load, avg_load; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), - &p->cpus_allowed)) - continue; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); - - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - - for_each_cpu(i, sched_group_cpus(group)) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, load_idx); - - avg_load += load; - } - - /* Adjust by relative CPU power of the group */ - avg_load = sg_div_cpu_power(group, - avg_load * SCHED_LOAD_SCALE); - - if (local_group) { - this_load = avg_load; - this = group; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; - } - } while (group = group->next, group != sd->groups); - - if (!idlest || 100*this_load < imbalance*min_load) - return NULL; - return idlest; -} - -/* - * find_idlest_cpu - find the idlest cpu among the cpus in group. - */ -static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) -{ - unsigned long load, min_load = ULONG_MAX; - int idlest = -1; - int i; - - /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; - } - } - - return idlest; -} - -/* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. - * - * Balance, ie. select the least loaded group. - * - * Returns the target CPU number, or the same CPU if no balancing is needed. - * - * preempt must be disabled. - */ -static int sched_balance_self(int cpu, int flag) -{ - struct task_struct *t = current; - struct sched_domain *tmp, *sd = NULL; - - for_each_domain(cpu, tmp) { - /* - * If power savings logic is enabled for a domain, stop there. - */ - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - break; - if (tmp->flags & flag) - sd = tmp; - } - - if (sd) - update_shares(sd); - - while (sd) { - struct sched_group *group; - int new_cpu, weight; - - if (!(sd->flags & flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, t, cpu); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, t, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = cpumask_weight(sched_domain_span(sd)); - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= cpumask_weight(sched_domain_span(tmp))) - break; - if (tmp->flags & flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } - - return cpu; -} - #endif /* CONFIG_SMP */ /** @@ -2412,37 +2306,22 @@ void task_oncpu_function_call(struct task_struct *p, * * returns failure only if the task is already active. */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +static int try_to_wake_up(struct task_struct *p, unsigned int state, + int wake_flags) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - long old_state; struct rq *rq; if (!sched_feat(SYNC_WAKEUPS)) - sync = 0; - -#ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { - struct sched_domain *sd; + wake_flags &= ~WF_SYNC; - this_cpu = raw_smp_processor_id(); - cpu = task_cpu(p); - - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - update_shares(sd); - break; - } - } - } -#endif + this_cpu = get_cpu(); smp_wmb(); rq = task_rq_lock(p, &flags); update_rq_clock(rq); - old_state = p->state; - if (!(old_state & state)) + if (!(p->state & state)) goto out; if (p->se.on_rq) @@ -2450,27 +2329,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) cpu = task_cpu(p); orig_cpu = cpu; - this_cpu = smp_processor_id(); #ifdef CONFIG_SMP if (unlikely(task_running(rq, p))) goto out_activate; - cpu = p->sched_class->select_task_rq(p, sync); - if (cpu != orig_cpu) { + /* + * In order to handle concurrent wakeups and release the rq->lock + * we put the task in TASK_WAKING state. + * + * First fix up the nr_uninterruptible count: + */ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; + p->state = TASK_WAKING; + task_rq_unlock(rq, &flags); + + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + if (cpu != orig_cpu) set_task_cpu(p, cpu); - task_rq_unlock(rq, &flags); - /* might preempt at this point */ - rq = task_rq_lock(p, &flags); - old_state = p->state; - if (!(old_state & state)) - goto out; - if (p->se.on_rq) - goto out_running; - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - } + rq = task_rq_lock(p, &flags); + WARN_ON(p->state != TASK_WAKING); + cpu = task_cpu(p); #ifdef CONFIG_SCHEDSTATS schedstat_inc(rq, ttwu_count); @@ -2490,7 +2371,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ schedstat_inc(p, se.nr_wakeups); - if (sync) + if (wake_flags & WF_SYNC) schedstat_inc(p, se.nr_wakeups_sync); if (orig_cpu != cpu) schedstat_inc(p, se.nr_wakeups_migrate); @@ -2519,7 +2400,7 @@ out_activate: out_running: trace_sched_wakeup(rq, p, success); - check_preempt_curr(rq, p, sync); + check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2528,6 +2409,7 @@ out_running: #endif out: task_rq_unlock(rq, &flags); + put_cpu(); return success; } @@ -2570,6 +2452,7 @@ static void __sched_fork(struct task_struct *p) p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; + p->se.avg_running = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -2631,18 +2514,41 @@ void sched_fork(struct task_struct *p, int clone_flags) __sched_fork(p); -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif - set_task_cpu(p, cpu); - /* - * Make sure we do not leak PI boosting priority to the child: + * Make sure we do not leak PI boosting priority to the child. */ p->prio = current->normal_prio; + + /* + * Revert to default priority/policy on fork if requested. + */ + if (unlikely(p->sched_reset_on_fork)) { + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) + p->policy = SCHED_NORMAL; + + if (p->normal_prio < DEFAULT_PRIO) + p->prio = DEFAULT_PRIO; + + if (PRIO_TO_NICE(p->static_prio) < 0) { + p->static_prio = NICE_TO_PRIO(0); + set_load_weight(p); + } + + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + } + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; +#ifdef CONFIG_SMP + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); +#endif + set_task_cpu(p, cpu); + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -2688,7 +2594,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) inc_nr_running(rq); } trace_sched_wakeup_new(rq, p, 1); - check_preempt_curr(rq, p, 0); + check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2796,12 +2702,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) { struct mm_struct *mm = rq->prev_mm; long prev_state; -#ifdef CONFIG_SMP - int post_schedule = 0; - - if (current->sched_class->needs_post_schedule) - post_schedule = current->sched_class->needs_post_schedule(rq); -#endif rq->prev_mm = NULL; @@ -2818,12 +2718,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_counter_task_sched_in(current, cpu_of(rq)); + perf_event_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif fire_sched_in_preempt_notifiers(current); if (mm) @@ -2838,6 +2734,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) } } +#ifdef CONFIG_SMP + +/* assumes rq->lock is held */ +static inline void pre_schedule(struct rq *rq, struct task_struct *prev) +{ + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); +} + +/* rq->lock is NOT held, but preemption is disabled */ +static inline void post_schedule(struct rq *rq) +{ + if (rq->post_schedule) { + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + if (rq->curr->sched_class->post_schedule) + rq->curr->sched_class->post_schedule(rq); + spin_unlock_irqrestore(&rq->lock, flags); + + rq->post_schedule = 0; + } +} + +#else + +static inline void pre_schedule(struct rq *rq, struct task_struct *p) +{ +} + +static inline void post_schedule(struct rq *rq) +{ +} + +#endif + /** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. @@ -2848,6 +2780,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) struct rq *rq = this_rq(); finish_task_switch(rq, prev); + + /* + * FIXME: do we need to worry about rq being invalidated by the + * task_switch? + */ + post_schedule(rq); + #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); @@ -2965,6 +2904,19 @@ unsigned long nr_iowait(void) return sum; } +unsigned long nr_iowait_cpu(void) +{ + struct rq *this = this_rq(); + return atomic_read(&this->nr_iowait); +} + +unsigned long this_cpu_load(void) +{ + struct rq *this = this_rq(); + return this->cpu_load[0]; +} + + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; @@ -3164,7 +3116,7 @@ out: void sched_exec(void) { int new_cpu, this_cpu = get_cpu(); - new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); + new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); put_cpu(); if (new_cpu != this_cpu) sched_migrate_task(current, new_cpu); @@ -3379,9 +3331,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, { const struct sched_class *class; - for (class = sched_class_highest; class; class = class->next) + for_each_class(class) { if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) return 1; + } return 0; } @@ -3544,7 +3497,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, * capacity but still has some space to pick up some load * from other group and save more power */ - if (sgs->sum_nr_running > sgs->group_capacity - 1) + if (sgs->sum_nr_running + 1 > sgs->group_capacity) return; if (sgs->sum_nr_running > sds->leader_nr_running || @@ -3583,11 +3536,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, *imbalance = sds->min_load_per_task; sds->busiest = sds->group_min; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(sds->group_leader); - } - return 1; } @@ -3612,6 +3560,102 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return SCHED_LOAD_SCALE; +} + +unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return default_scale_freq_power(sd, cpu); +} + +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long smt_gain = sd->smt_gain; + + smt_gain /= weight; + + return smt_gain; +} + +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + return default_scale_smt_power(sd, cpu); +} + +unsigned long scale_rt_power(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 total, available; + + sched_avg_update(rq); + + total = sched_avg_period() + (rq->clock - rq->age_stamp); + available = total - rq->rt_avg; + + if (unlikely((s64)total < SCHED_LOAD_SCALE)) + total = SCHED_LOAD_SCALE; + + total >>= SCHED_LOAD_SHIFT; + + return div_u64(available, total); +} + +static void update_cpu_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long power = SCHED_LOAD_SCALE; + struct sched_group *sdg = sd->groups; + + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; + + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { + if (sched_feat(ARCH_POWER)) + power *= arch_scale_smt_power(sd, cpu); + else + power *= default_scale_smt_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; + } + + power *= scale_rt_power(cpu); + power >>= SCHED_LOAD_SHIFT; + + if (!power) + power = 1; + + sdg->cpu_power = power; +} + +static void update_group_power(struct sched_domain *sd, int cpu) +{ + struct sched_domain *child = sd->child; + struct sched_group *group, *sdg = sd->groups; + unsigned long power; + + if (!child) { + update_cpu_power(sd, cpu); + return; + } + + power = 0; + + group = child->groups; + do { + power += group->cpu_power; + group = group->next; + } while (group != child->groups); + + sdg->cpu_power = power; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @group: sched_group whose statistics are to be updated. @@ -3624,7 +3668,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. */ -static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, +static inline void update_sg_lb_stats(struct sched_domain *sd, + struct sched_group *group, int this_cpu, enum cpu_idle_type idle, int load_idx, int *sd_idle, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) @@ -3635,8 +3680,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - if (local_group) + if (local_group) { balance_cpu = group_first_cpu(group); + if (balance_cpu == this_cpu) + update_group_power(sd, this_cpu); + } /* Tally up the load of all CPUs in the group */ sum_avg_load_per_task = avg_load_per_task = 0; @@ -3685,8 +3733,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = sg_div_cpu_power(group, - sgs->group_load * SCHED_LOAD_SCALE); + sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; /* @@ -3698,14 +3745,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, * normalized nr_running number somewhere that negates * the hierarchy? */ - avg_load_per_task = sg_div_cpu_power(group, - sum_avg_load_per_task * SCHED_LOAD_SCALE); + avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / + group->cpu_power; if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) sgs->group_imb = 1; - sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; - + sgs->group_capacity = + DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); } /** @@ -3723,9 +3770,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, const struct cpumask *cpus, int *balance, struct sd_lb_stats *sds) { + struct sched_domain *child = sd->child; struct sched_group *group = sd->groups; struct sg_lb_stats sgs; - int load_idx; + int load_idx, prefer_sibling = 0; + + if (child && child->flags & SD_PREFER_SIBLING) + prefer_sibling = 1; init_sd_power_savings_stats(sd, sds, idle); load_idx = get_sd_load_idx(sd, idle); @@ -3736,14 +3787,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); if (local_group && balance && !(*balance)) return; sds->total_load += sgs.group_load; - sds->total_pwr += group->__cpu_power; + sds->total_pwr += group->cpu_power; + + /* + * In case the child domain prefers tasks go to siblings + * first, lower the group capacity to one so that we'll try + * and move all the excess tasks away. + */ + if (prefer_sibling) + sgs.group_capacity = min(sgs.group_capacity, 1UL); if (local_group) { sds->this_load = sgs.avg_load; @@ -3763,7 +3822,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, update_sd_power_savings_stats(group, sds, local_group, &sgs); group = group->next; } while (group != sd->groups); - } /** @@ -3801,28 +3859,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, * moving them. */ - pwr_now += sds->busiest->__cpu_power * + pwr_now += sds->busiest->cpu_power * min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->__cpu_power * + pwr_now += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = sg_div_cpu_power(sds->busiest, - sds->busiest_load_per_task * SCHED_LOAD_SCALE); + tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + sds->busiest->cpu_power; if (sds->max_load > tmp) - pwr_move += sds->busiest->__cpu_power * + pwr_move += sds->busiest->cpu_power * min(sds->busiest_load_per_task, sds->max_load - tmp); /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->__cpu_power < + if (sds->max_load * sds->busiest->cpu_power < sds->busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = sg_div_cpu_power(sds->this, - sds->max_load * sds->busiest->__cpu_power); + tmp = (sds->max_load * sds->busiest->cpu_power) / + sds->this->cpu_power; else - tmp = sg_div_cpu_power(sds->this, - sds->busiest_load_per_task * SCHED_LOAD_SCALE); - pwr_move += sds->this->__cpu_power * + tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + sds->this->cpu_power; + pwr_move += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; @@ -3857,8 +3915,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, sds->max_load - sds->busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->__cpu_power, - (sds->avg_load - sds->this_load) * sds->this->__cpu_power) + *imbalance = min(max_pull * sds->busiest->cpu_power, + (sds->avg_load - sds->this_load) * sds->this->cpu_power) / SCHED_LOAD_SCALE; /* @@ -3988,15 +4046,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, int i; for_each_cpu(i, sched_group_cpus(group)) { + unsigned long power = power_of(i); + unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); unsigned long wl; if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); - wl = weighted_cpuload(i); + wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; + wl /= power; - if (rq->nr_running == 1 && wl > imbalance) + if (capacity && rq->nr_running == 1 && wl > imbalance) continue; if (wl > max_load) { @@ -5031,17 +5092,16 @@ void account_idle_time(cputime_t cputime) */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t one_jiffy = jiffies_to_cputime(1); - cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); if (user_tick) - account_user_time(p, one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, one_jiffy_scaled); else - account_idle_time(one_jiffy); + account_idle_time(cputime_one_jiffy); } /* @@ -5145,7 +5205,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); - perf_counter_task_tick(curr, cpu); + perf_event_task_tick(curr, cpu); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -5257,14 +5317,13 @@ static inline void schedule_debug(struct task_struct *prev) #endif } -static void put_prev_task(struct rq *rq, struct task_struct *prev) +static void put_prev_task(struct rq *rq, struct task_struct *p) { - if (prev->state == TASK_RUNNING) { - u64 runtime = prev->se.sum_exec_runtime; + u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; - runtime -= prev->se.prev_sum_exec_runtime; - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_running, runtime); + if (p->state == TASK_RUNNING) { /* * In order to avoid avg_overlap growing stale when we are * indeed overlapping and hence not getting put to sleep, grow @@ -5274,9 +5333,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) * correlates to the amount of cache footprint a task can * build up. */ - update_avg(&prev->se.avg_overlap, runtime); + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_overlap, runtime); + } else { + update_avg(&p->se.avg_running, 0); } - prev->sched_class->put_prev_task(rq, prev); + p->sched_class->put_prev_task(rq, p); } /* @@ -5325,7 +5387,7 @@ need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_qsctr_inc(cpu); + rcu_sched_qs(cpu); prev = rq->curr; switch_count = &prev->nivcsw; @@ -5349,10 +5411,7 @@ need_resched_nonpreemptible: switch_count = &prev->nvcsw; } -#ifdef CONFIG_SMP - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -#endif + pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); @@ -5362,7 +5421,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_counter_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; @@ -5378,6 +5437,8 @@ need_resched_nonpreemptible: } else spin_unlock_irq(&rq->lock); + post_schedule(rq); + if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; @@ -5509,10 +5570,10 @@ asmlinkage void __sched preempt_schedule_irq(void) #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, sync); + return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -5526,14 +5587,14 @@ EXPORT_SYMBOL(default_wake_function); * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) + int nr_exclusive, int wake_flags, void *key) { wait_queue_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; - if (curr->func(curr, mode, sync, key) && + if (curr->func(curr, mode, wake_flags, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } @@ -5594,16 +5655,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; - int sync = 1; + int wake_flags = WF_SYNC; if (unlikely(!q)) return; if (unlikely(!nr_exclusive)) - sync = 0; + wake_flags = 0; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, key); + __wake_up_common(q, mode, nr_exclusive, wake_flags, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); @@ -6123,17 +6184,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; + int reset_on_fork; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); recheck: /* double check policy once rq lock held */ - if (policy < 0) + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; - else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) - return -EINVAL; + } else { + reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); + policy &= ~SCHED_RESET_ON_FORK; + + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) + return -EINVAL; + } + /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, @@ -6177,6 +6246,10 @@ recheck: /* can't change other user's priorities */ if (!check_same_owner(p)) return -EPERM; + + /* Normal users shall not reset the sched_reset_on_fork flag */ + if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; } if (user) { @@ -6220,6 +6293,8 @@ recheck: if (running) p->sched_class->put_prev_task(rq, p); + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); @@ -6336,14 +6411,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) if (p) { retval = security_task_getscheduler(p); if (!retval) - retval = p->policy; + retval = p->policy + | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } read_unlock(&tasklist_lock); return retval; } /** - * sys_sched_getscheduler - get the RT priority of a thread + * sys_sched_getparam - get the RT priority of a thread * @pid: the pid in question. * @param: structure containing the RT priority. */ @@ -6571,19 +6647,9 @@ static inline int should_resched(void) static void __cond_resched(void) { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - __might_sleep(__FILE__, __LINE__); -#endif - /* - * The BKS might be reacquired before we have dropped - * PREEMPT_ACTIVE, which could trigger a second - * cond_resched() call. - */ - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - } while (need_resched()); + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); } int __sched _cond_resched(void) @@ -6597,18 +6663,20 @@ int __sched _cond_resched(void) EXPORT_SYMBOL(_cond_resched); /* - * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * __cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * * This works OK both with and without CONFIG_PREEMPT. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ -int cond_resched_lock(spinlock_t *lock) +int __cond_resched_lock(spinlock_t *lock) { int resched = should_resched(); int ret = 0; + lockdep_assert_held(lock); + if (spin_needbreak(lock) || resched) { spin_unlock(lock); if (resched) @@ -6620,9 +6688,9 @@ int cond_resched_lock(spinlock_t *lock) } return ret; } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_lock); -int __sched cond_resched_softirq(void) +int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); @@ -6634,7 +6702,7 @@ int __sched cond_resched_softirq(void) } return 0; } -EXPORT_SYMBOL(cond_resched_softirq); +EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. @@ -6658,11 +6726,13 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + current->in_iowait = 1; schedule(); + current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); } @@ -6670,12 +6740,14 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); long ret; delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + current->in_iowait = 1; ret = schedule_timeout(timeout); + current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); return ret; @@ -6759,23 +6831,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - /* - * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER - * tasks that are on an otherwise idle runqueue: - */ - time_slice = 0; - if (p->policy == SCHED_RR) { - time_slice = DEF_TIMESLICE; - } else if (p->policy != SCHED_FIFO) { - struct sched_entity *se = &p->se; - unsigned long flags; - struct rq *rq; + time_slice = p->sched_class->get_rr_interval(p); - rq = task_rq_lock(p, &flags); - if (rq->cfs.load.weight) - time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - task_rq_unlock(rq, &flags); - } read_unlock(&tasklist_lock); jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; @@ -6992,8 +7049,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ + struct task_struct *mt = rq->migration_thread; + + get_task_struct(mt); task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); + put_task_struct(mt); wait_for_completion(&req.done); tlb_migrate_finish(p->mm); return 0; @@ -7051,6 +7112,11 @@ fail: return ret; } +#define RCU_MIGRATION_IDLE 0 +#define RCU_MIGRATION_NEED_QS 1 +#define RCU_MIGRATION_GOT_QS 2 +#define RCU_MIGRATION_MUST_SYNC 3 + /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto @@ -7058,6 +7124,7 @@ fail: */ static int migration_thread(void *data) { + int badcpu; int cpu = (long)data; struct rq *rq; @@ -7092,8 +7159,17 @@ static int migration_thread(void *data) req = list_entry(head->next, struct migration_req, list); list_del_init(head->next); - spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); + if (req->task != NULL) { + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + } else if (likely(cpu == (badcpu = smp_processor_id()))) { + req->dest_cpu = RCU_MIGRATION_GOT_QS; + spin_unlock(&rq->lock); + } else { + req->dest_cpu = RCU_MIGRATION_MUST_SYNC; + spin_unlock(&rq->lock); + WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); + } local_irq_enable(); complete(&req->done); @@ -7607,7 +7683,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* * Register at high priority so that task migration (migrate_all_tasks) * happens before everything else. This has to be lower priority than - * the notifier in the perf_counter subsystem, though. + * the notifier in the perf_event subsystem, though. */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, @@ -7625,7 +7701,7 @@ static int __init migration_init(void) migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); - return err; + return 0; } early_initcall(migration_init); #endif @@ -7672,7 +7748,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->__cpu_power) { + if (!group->cpu_power) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -7696,9 +7772,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->__cpu_power != SCHED_LOAD_SCALE) { - printk(KERN_CONT " (__cpu_power = %d)", - group->__cpu_power); + if (group->cpu_power != SCHED_LOAD_SCALE) { + printk(KERN_CONT " (cpu_power = %d)", + group->cpu_power); } group = group->next; @@ -7763,9 +7839,7 @@ static int sd_degenerate(struct sched_domain *sd) } /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_IDLE | - SD_WAKE_AFFINE | - SD_WAKE_BALANCE)) + if (sd->flags & (SD_WAKE_AFFINE)) return 0; return 1; @@ -7782,10 +7856,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; - /* Does parent contain flags not in child? */ - /* WAKE_BALANCE is a subset of WAKE_AFFINE */ - if (cflags & SD_WAKE_AFFINE) - pflags &= ~SD_WAKE_BALANCE; /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { pflags &= ~(SD_LOAD_BALANCE | @@ -7841,7 +7911,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) rq->rd = rd; cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); @@ -7983,7 +8053,7 @@ init_sched_build_groups(const struct cpumask *span, continue; cpumask_clear(sched_group_cpus(sg)); - sg->__cpu_power = 0; + sg->cpu_power = 0; for_each_cpu(j, span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) @@ -8091,6 +8161,39 @@ struct static_sched_domain { DECLARE_BITMAP(span, CONFIG_NR_CPUS); }; +struct s_data { +#ifdef CONFIG_NUMA + int sd_allnodes; + cpumask_var_t domainspan; + cpumask_var_t covered; + cpumask_var_t notcovered; +#endif + cpumask_var_t nodemask; + cpumask_var_t this_sibling_map; + cpumask_var_t this_core_map; + cpumask_var_t send_covered; + cpumask_var_t tmpmask; + struct sched_group **sched_group_nodes; + struct root_domain *rd; +}; + +enum s_alloc { + sa_sched_groups = 0, + sa_rootdomain, + sa_tmpmask, + sa_send_covered, + sa_this_core_map, + sa_this_sibling_map, + sa_nodemask, + sa_sched_group_nodes, +#ifdef CONFIG_NUMA + sa_notcovered, + sa_covered, + sa_domainspan, +#endif + sa_none, +}; + /* * SMT sched-domains: */ @@ -8208,11 +8311,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) continue; } - sg_inc_cpu_power(sg, sd->groups->__cpu_power); + sg->cpu_power += sd->groups->cpu_power; } sg = sg->next; } while (sg != group_head); } + +static int build_numa_sched_groups(struct s_data *d, + const struct cpumask *cpu_map, int num) +{ + struct sched_domain *sd; + struct sched_group *sg, *prev; + int n, j; + + cpumask_clear(d->covered); + cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); + if (cpumask_empty(d->nodemask)) { + d->sched_group_nodes[num] = NULL; + goto out; + } + + sched_domain_node_span(num, d->domainspan); + cpumask_and(d->domainspan, d->domainspan, cpu_map); + + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, num); + if (!sg) { + printk(KERN_WARNING "Can not alloc domain group for node %d\n", + num); + return -ENOMEM; + } + d->sched_group_nodes[num] = sg; + + for_each_cpu(j, d->nodemask) { + sd = &per_cpu(node_domains, j).sd; + sd->groups = sg; + } + + sg->cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), d->nodemask); + sg->next = sg; + cpumask_or(d->covered, d->covered, d->nodemask); + + prev = sg; + for (j = 0; j < nr_node_ids; j++) { + n = (num + j) % nr_node_ids; + cpumask_complement(d->notcovered, d->covered); + cpumask_and(d->tmpmask, d->notcovered, cpu_map); + cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); + if (cpumask_empty(d->tmpmask)) + break; + cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); + if (cpumask_empty(d->tmpmask)) + continue; + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, num); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + return -ENOMEM; + } + sg->cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), d->tmpmask); + sg->next = prev->next; + cpumask_or(d->covered, d->covered, d->tmpmask); + prev->next = sg; + prev = sg; + } +out: + return 0; +} #endif /* CONFIG_NUMA */ #ifdef CONFIG_NUMA @@ -8266,15 +8434,13 @@ static void free_sched_groups(const struct cpumask *cpu_map, * there are asymmetries in the topology. If there are asymmetries, group * having more cpu_power will pickup more load compared to the group having * less cpu_power. - * - * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents - * the maximum number of tasks a group can handle in the presence of other idle - * or lightly loaded groups in the same sched domain. */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { struct sched_domain *child; struct sched_group *group; + long power; + int weight; WARN_ON(!sd || !sd->groups); @@ -8283,28 +8449,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) child = sd->child; - sd->groups->__cpu_power = 0; + sd->groups->cpu_power = 0; - /* - * For perf policy, if the groups in child domain share resources - * (for example cores sharing some portions of the cache hierarchy - * or SMT), then set this domain groups cpu_power such that each group - * can handle only one task, when there are other idle groups in the - * same sched domain. - */ - if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && - (child->flags & - (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { - sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); + if (!child) { + power = SCHED_LOAD_SCALE; + weight = cpumask_weight(sched_domain_span(sd)); + /* + * SMT siblings share the power of a single core. + * Usually multiple threads get a better yield out of + * that one core than a single thread would have, + * reflect that in sd->smt_gain. + */ + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { + power *= sd->smt_gain; + power /= weight; + power >>= SCHED_LOAD_SHIFT; + } + sd->groups->cpu_power += power; return; } /* - * add cpu_power of each child group to this groups cpu_power + * Add cpu_power of each child group to this groups cpu_power. */ group = child->groups; do { - sg_inc_cpu_power(sd->groups, group->__cpu_power); + sd->groups->cpu_power += group->cpu_power; group = group->next; } while (group != child->groups); } @@ -8371,287 +8541,292 @@ static void set_domain_attribute(struct sched_domain *sd, request = attr->relax_domain_level; if (request < sd->level) { /* turn off idle balance on this domain */ - sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } else { /* turn on idle balance on this domain */ - sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } +} + +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, + const struct cpumask *cpu_map) +{ + switch (what) { + case sa_sched_groups: + free_sched_groups(cpu_map, d->tmpmask); /* fall through */ + d->sched_group_nodes = NULL; + case sa_rootdomain: + free_rootdomain(d->rd); /* fall through */ + case sa_tmpmask: + free_cpumask_var(d->tmpmask); /* fall through */ + case sa_send_covered: + free_cpumask_var(d->send_covered); /* fall through */ + case sa_this_core_map: + free_cpumask_var(d->this_core_map); /* fall through */ + case sa_this_sibling_map: + free_cpumask_var(d->this_sibling_map); /* fall through */ + case sa_nodemask: + free_cpumask_var(d->nodemask); /* fall through */ + case sa_sched_group_nodes: +#ifdef CONFIG_NUMA + kfree(d->sched_group_nodes); /* fall through */ + case sa_notcovered: + free_cpumask_var(d->notcovered); /* fall through */ + case sa_covered: + free_cpumask_var(d->covered); /* fall through */ + case sa_domainspan: + free_cpumask_var(d->domainspan); /* fall through */ +#endif + case sa_none: + break; } } -/* - * Build sched domains for a given set of cpus and attach the sched domains - * to the individual cpus - */ -static int __build_sched_domains(const struct cpumask *cpu_map, - struct sched_domain_attr *attr) +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, + const struct cpumask *cpu_map) { - int i, err = -ENOMEM; - struct root_domain *rd; - cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, - tmpmask; #ifdef CONFIG_NUMA - cpumask_var_t domainspan, covered, notcovered; - struct sched_group **sched_group_nodes = NULL; - int sd_allnodes = 0; - - if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) - goto out; - if (!alloc_cpumask_var(&covered, GFP_KERNEL)) - goto free_domainspan; - if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) - goto free_covered; -#endif - - if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) - goto free_notcovered; - if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) - goto free_nodemask; - if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) - goto free_this_sibling_map; - if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) - goto free_this_core_map; - if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) - goto free_send_covered; - -#ifdef CONFIG_NUMA - /* - * Allocate the per-node list of sched groups - */ - sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), - GFP_KERNEL); - if (!sched_group_nodes) { + if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) + return sa_none; + if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) + return sa_domainspan; + if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) + return sa_covered; + /* Allocate the per-node list of sched groups */ + d->sched_group_nodes = kcalloc(nr_node_ids, + sizeof(struct sched_group *), GFP_KERNEL); + if (!d->sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); - goto free_tmpmask; - } -#endif - - rd = alloc_rootdomain(); - if (!rd) { + return sa_notcovered; + } + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; +#endif + if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) + return sa_sched_group_nodes; + if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) + return sa_nodemask; + if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) + return sa_this_sibling_map; + if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + return sa_this_core_map; + if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) + return sa_send_covered; + d->rd = alloc_rootdomain(); + if (!d->rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); - goto free_sched_groups; + return sa_tmpmask; } + return sa_rootdomain; +} +static struct sched_domain *__build_numa_sched_domains(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) +{ + struct sched_domain *sd = NULL; #ifdef CONFIG_NUMA - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; -#endif - - /* - * Set up domains for cpus specified by the cpu_map. - */ - for_each_cpu(i, cpu_map) { - struct sched_domain *sd = NULL, *p; - - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); - -#ifdef CONFIG_NUMA - if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { - sd = &per_cpu(allnodes_domains, i).sd; - SD_INIT(sd, ALLNODES); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); - p = sd; - sd_allnodes = 1; - } else - p = NULL; + struct sched_domain *parent; - sd = &per_cpu(node_domains, i).sd; - SD_INIT(sd, NODE); + d->sd_allnodes = 0; + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { + sd = &per_cpu(allnodes_domains, i).sd; + SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); - sd->parent = p; - if (p) - p->child = sd; - cpumask_and(sched_domain_span(sd), - sched_domain_span(sd), cpu_map); + cpumask_copy(sched_domain_span(sd), cpu_map); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); + d->sd_allnodes = 1; + } + parent = sd; + + sd = &per_cpu(node_domains, i).sd; + SD_INIT(sd, NODE); + set_domain_attribute(sd, attr); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); + sd->parent = parent; + if (parent) + parent->child = sd; + cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); #endif + return sd; +} - p = sd; - sd = &per_cpu(phys_domains, i).sd; - SD_INIT(sd, CPU); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), nodemask); - sd->parent = p; - if (p) - p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); +static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd; + sd = &per_cpu(phys_domains, i).sd; + SD_INIT(sd, CPU); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), d->nodemask); + sd->parent = parent; + if (parent) + parent->child = sd; + cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); + return sd; +} +static struct sched_domain *__build_mc_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; #ifdef CONFIG_SCHED_MC - p = sd; - sd = &per_cpu(core_domains, i).sd; - SD_INIT(sd, MC); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, - cpu_coregroup_mask(i)); - sd->parent = p; - p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); + sd = &per_cpu(core_domains, i).sd; + SD_INIT(sd, MC); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); #endif + return sd; +} +static struct sched_domain *__build_smt_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; #ifdef CONFIG_SCHED_SMT - p = sd; - sd = &per_cpu(cpu_domains, i).sd; - SD_INIT(sd, SIBLING); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), - topology_thread_cpumask(i), cpu_map); - sd->parent = p; - p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); + sd = &per_cpu(cpu_domains, i).sd; + SD_INIT(sd, SIBLING); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); #endif - } + return sd; +} +static void build_sched_groups(struct s_data *d, enum sched_domain_level l, + const struct cpumask *cpu_map, int cpu) +{ + switch (l) { #ifdef CONFIG_SCHED_SMT - /* Set up CPU (sibling) groups */ - for_each_cpu(i, cpu_map) { - cpumask_and(this_sibling_map, - topology_thread_cpumask(i), cpu_map); - if (i != cpumask_first(this_sibling_map)) - continue; - - init_sched_build_groups(this_sibling_map, cpu_map, - &cpu_to_cpu_group, - send_covered, tmpmask); - } + case SD_LV_SIBLING: /* set up CPU (sibling) groups */ + cpumask_and(d->this_sibling_map, cpu_map, + topology_thread_cpumask(cpu)); + if (cpu == cpumask_first(d->this_sibling_map)) + init_sched_build_groups(d->this_sibling_map, cpu_map, + &cpu_to_cpu_group, + d->send_covered, d->tmpmask); + break; #endif - #ifdef CONFIG_SCHED_MC - /* Set up multi-core groups */ - for_each_cpu(i, cpu_map) { - cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); - if (i != cpumask_first(this_core_map)) - continue; - - init_sched_build_groups(this_core_map, cpu_map, - &cpu_to_core_group, - send_covered, tmpmask); - } + case SD_LV_MC: /* set up multi-core groups */ + cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); + if (cpu == cpumask_first(d->this_core_map)) + init_sched_build_groups(d->this_core_map, cpu_map, + &cpu_to_core_group, + d->send_covered, d->tmpmask); + break; #endif - - /* Set up physical groups */ - for (i = 0; i < nr_node_ids; i++) { - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) - continue; - - init_sched_build_groups(nodemask, cpu_map, - &cpu_to_phys_group, - send_covered, tmpmask); - } - + case SD_LV_CPU: /* set up physical groups */ + cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); + if (!cpumask_empty(d->nodemask)) + init_sched_build_groups(d->nodemask, cpu_map, + &cpu_to_phys_group, + d->send_covered, d->tmpmask); + break; #ifdef CONFIG_NUMA - /* Set up node groups */ - if (sd_allnodes) { - init_sched_build_groups(cpu_map, cpu_map, - &cpu_to_allnodes_group, - send_covered, tmpmask); + case SD_LV_ALLNODES: + init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, + d->send_covered, d->tmpmask); + break; +#endif + default: + break; } +} - for (i = 0; i < nr_node_ids; i++) { - /* Set up node groups */ - struct sched_group *sg, *prev; - int j; - - cpumask_clear(covered); - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) { - sched_group_nodes[i] = NULL; - continue; - } +/* + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus + */ +static int __build_sched_domains(const struct cpumask *cpu_map, + struct sched_domain_attr *attr) +{ + enum s_alloc alloc_state = sa_none; + struct s_data d; + struct sched_domain *sd; + int i; +#ifdef CONFIG_NUMA + d.sd_allnodes = 0; +#endif - sched_domain_node_span(i, domainspan); - cpumask_and(domainspan, domainspan, cpu_map); + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); + if (alloc_state != sa_rootdomain) + goto error; + alloc_state = sa_sched_groups; - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for " - "node %d\n", i); - goto error; - } - sched_group_nodes[i] = sg; - for_each_cpu(j, nodemask) { - struct sched_domain *sd; + /* + * Set up domains for cpus specified by the cpu_map. + */ + for_each_cpu(i, cpu_map) { + cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), + cpu_map); - sd = &per_cpu(node_domains, j).sd; - sd->groups = sg; - } - sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), nodemask); - sg->next = sg; - cpumask_or(covered, covered, nodemask); - prev = sg; + sd = __build_numa_sched_domains(&d, cpu_map, attr, i); + sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); + sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); + sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); + } - for (j = 0; j < nr_node_ids; j++) { - int n = (i + j) % nr_node_ids; + for_each_cpu(i, cpu_map) { + build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); + build_sched_groups(&d, SD_LV_MC, cpu_map, i); + } - cpumask_complement(notcovered, covered); - cpumask_and(tmpmask, notcovered, cpu_map); - cpumask_and(tmpmask, tmpmask, domainspan); - if (cpumask_empty(tmpmask)) - break; + /* Set up physical groups */ + for (i = 0; i < nr_node_ids; i++) + build_sched_groups(&d, SD_LV_CPU, cpu_map, i); - cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); - if (cpumask_empty(tmpmask)) - continue; +#ifdef CONFIG_NUMA + /* Set up node groups */ + if (d.sd_allnodes) + build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); - sg = kmalloc_node(sizeof(struct sched_group) + - cpumask_size(), - GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); - goto error; - } - sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), tmpmask); - sg->next = prev->next; - cpumask_or(covered, covered, tmpmask); - prev->next = sg; - prev = sg; - } - } + for (i = 0; i < nr_node_ids; i++) + if (build_numa_sched_groups(&d, cpu_map, i)) + goto error; #endif /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; - + sd = &per_cpu(cpu_domains, i).sd; init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i).sd; - + sd = &per_cpu(core_domains, i).sd; init_sched_groups_power(i, sd); } #endif for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i).sd; - + sd = &per_cpu(phys_domains, i).sd; init_sched_groups_power(i, sd); } #ifdef CONFIG_NUMA for (i = 0; i < nr_node_ids; i++) - init_numa_sched_groups_power(sched_group_nodes[i]); + init_numa_sched_groups_power(d.sched_group_nodes[i]); - if (sd_allnodes) { + if (d.sd_allnodes) { struct sched_group *sg; cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, - tmpmask); + d.tmpmask); init_numa_sched_groups_power(sg); } #endif /* Attach the domains */ for_each_cpu(i, cpu_map) { - struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) @@ -8659,44 +8834,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #else sd = &per_cpu(phys_domains, i).sd; #endif - cpu_attach_domain(sd, rd, i); + cpu_attach_domain(sd, d.rd, i); } - err = 0; - -free_tmpmask: - free_cpumask_var(tmpmask); -free_send_covered: - free_cpumask_var(send_covered); -free_this_core_map: - free_cpumask_var(this_core_map); -free_this_sibling_map: - free_cpumask_var(this_sibling_map); -free_nodemask: - free_cpumask_var(nodemask); -free_notcovered: -#ifdef CONFIG_NUMA - free_cpumask_var(notcovered); -free_covered: - free_cpumask_var(covered); -free_domainspan: - free_cpumask_var(domainspan); -out: -#endif - return err; - -free_sched_groups: -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - goto free_tmpmask; + d.sched_group_nodes = NULL; /* don't free this we still need it */ + __free_domain_allocs(&d, sa_tmpmask, cpu_map); + return 0; -#ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map, tmpmask); - free_rootdomain(rd); - goto free_tmpmask; -#endif + __free_domain_allocs(&d, alloc_state, cpu_map); + return -ENOMEM; } static int build_sched_domains(const struct cpumask *cpu_map) @@ -9015,6 +9162,7 @@ void __init sched_init_smp(void) cpumask_var_t non_isolated_cpus; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); #if defined(CONFIG_NUMA) sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@ -9046,7 +9194,6 @@ void __init sched_init_smp(void) sched_init_granularity(); free_cpumask_var(non_isolated_cpus); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); init_sched_rt_class(); } #else @@ -9304,11 +9451,11 @@ void __init sched_init(void) * system cpu resource, based on the weight assigned to root * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished * by letting tasks of init_task_group sit in a separate cfs_rq - * (init_cfs_rq) and having one entity represent this group of + * (init_tg_cfs_rq) and having one entity represent this group of * tasks in rq->cfs (i.e init_task_group->se[] != NULL). */ init_tg_cfs_entry(&init_task_group, - &per_cpu(init_cfs_rq, i), + &per_cpu(init_tg_cfs_rq, i), &per_cpu(init_sched_entity, i), i, 1, root_task_group.se[i]); @@ -9334,6 +9481,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; @@ -9392,19 +9540,26 @@ void __init sched_init(void) alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ - perf_counter_init(); + perf_event_init(); scheduler_running = 1; } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void __might_sleep(char *file, int line) +static inline int preempt_count_equals(int preempt_offset) +{ + int nested = preempt_count() & ~PREEMPT_ACTIVE; + + return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); +} + +void __might_sleep(char *file, int line, int preempt_offset) { #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((!in_atomic() && !irqs_disabled()) || - system_state != SYSTEM_RUNNING || oops_in_progress) + if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; @@ -10581,3 +10736,113 @@ struct cgroup_subsys cpuacct_subsys = { .subsys_id = cpuacct_subsys_id, }; #endif /* CONFIG_CGROUP_CPUACCT */ + +#ifndef CONFIG_SMP + +int rcu_expedited_torture_stats(char *page) +{ + return 0; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +void synchronize_sched_expedited(void) +{ +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#else /* #ifndef CONFIG_SMP */ + +static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); +static DEFINE_MUTEX(rcu_sched_expedited_mutex); + +#define RCU_EXPEDITED_STATE_POST -2 +#define RCU_EXPEDITED_STATE_IDLE -1 + +static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + +int rcu_expedited_torture_stats(char *page) +{ + int cnt = 0; + int cpu; + + cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); + for_each_online_cpu(cpu) { + cnt += sprintf(&page[cnt], " %d:%d", + cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +static long synchronize_sched_expedited_count; + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. + */ +void synchronize_sched_expedited(void) +{ + int cpu; + unsigned long flags; + bool need_full_sync = 0; + struct rq *rq; + struct migration_req *req; + long snap; + int trycount = 0; + + smp_mb(); /* ensure prior mod happens before capturing snap. */ + snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; + get_online_cpus(); + while (!mutex_trylock(&rcu_sched_expedited_mutex)) { + put_online_cpus(); + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + get_online_cpus(); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_POST; + for_each_online_cpu(cpu) { + rq = cpu_rq(cpu); + req = &per_cpu(rcu_migration_req, cpu); + init_completion(&req->done); + req->task = NULL; + req->dest_cpu = RCU_MIGRATION_NEED_QS; + spin_lock_irqsave(&rq->lock, flags); + list_add(&req->list, &rq->migration_queue); + spin_unlock_irqrestore(&rq->lock, flags); + wake_up_process(rq->migration_thread); + } + for_each_online_cpu(cpu) { + rcu_expedited_state = cpu; + req = &per_cpu(rcu_migration_req, cpu); + rq = cpu_rq(cpu); + wait_for_completion(&req->done); + spin_lock_irqsave(&rq->lock, flags); + if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) + need_full_sync = 1; + req->dest_cpu = RCU_MIGRATION_IDLE; + spin_unlock_irqrestore(&rq->lock, flags); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + mutex_unlock(&rcu_sched_expedited_mutex); + put_online_cpus(); + if (need_full_sync) + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#endif /* #else #ifndef CONFIG_SMP */ diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e1d16c9..ac2e1dc 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running; __read_mostly int sched_clock_stable; struct sched_clock_data { - /* - * Raw spinlock - this is a special case: this might be called - * from within instrumentation code so we dont want to do any - * instrumentation ourselves. - */ - raw_spinlock_t lock; - u64 tick_raw; u64 tick_gtod; u64 clock; @@ -80,7 +73,6 @@ void sched_clock_init(void) for_each_possible_cpu(cpu) { struct sched_clock_data *scd = cpu_sdc(cpu); - scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; @@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) +static u64 sched_clock_local(struct sched_clock_data *scd) { - s64 delta = now - scd->tick_raw; - u64 clock, min_clock, max_clock; + u64 now, clock, old_clock, min_clock, max_clock; + s64 delta; +again: + now = sched_clock(); + delta = now - scd->tick_raw; if (unlikely(delta < 0)) delta = 0; + old_clock = scd->clock; + /* * scd->clock = clamp(scd->tick_gtod + delta, * max(scd->tick_gtod, scd->clock), @@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) */ clock = scd->tick_gtod + delta; - min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); + min_clock = wrap_max(scd->tick_gtod, old_clock); + max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - scd->clock = clock; + if (cmpxchg(&scd->clock, old_clock, clock) != old_clock) + goto again; - return scd->clock; + return clock; } -static void lock_double_clock(struct sched_clock_data *data1, - struct sched_clock_data *data2) +static u64 sched_clock_remote(struct sched_clock_data *scd) { - if (data1 < data2) { - __raw_spin_lock(&data1->lock); - __raw_spin_lock(&data2->lock); + struct sched_clock_data *my_scd = this_scd(); + u64 this_clock, remote_clock; + u64 *ptr, old_val, val; + + sched_clock_local(my_scd); +again: + this_clock = my_scd->clock; + remote_clock = scd->clock; + + /* + * Use the opportunity that we have both locks + * taken to couple the two clocks: we take the + * larger time as the latest time for both + * runqueues. (this creates monotonic movement) + */ + if (likely((s64)(remote_clock - this_clock) < 0)) { + ptr = &scd->clock; + old_val = remote_clock; + val = this_clock; } else { - __raw_spin_lock(&data2->lock); - __raw_spin_lock(&data1->lock); + /* + * Should be rare, but possible: + */ + ptr = &my_scd->clock; + old_val = this_clock; + val = remote_clock; } + + if (cmpxchg(ptr, old_val, val) != old_val) + goto again; + + return val; } u64 sched_clock_cpu(int cpu) { - u64 now, clock, this_clock, remote_clock; struct sched_clock_data *scd; + u64 clock; + + WARN_ON_ONCE(!irqs_disabled()); if (sched_clock_stable) return sched_clock(); - scd = cpu_sdc(cpu); - - /* - * Normally this is not called in NMI context - but if it is, - * trying to do any locking here is totally lethal. - */ - if (unlikely(in_nmi())) - return scd->clock; - if (unlikely(!sched_clock_running)) return 0ull; - WARN_ON_ONCE(!irqs_disabled()); - now = sched_clock(); - - if (cpu != raw_smp_processor_id()) { - struct sched_clock_data *my_scd = this_scd(); - - lock_double_clock(scd, my_scd); - - this_clock = __update_sched_clock(my_scd, now); - remote_clock = scd->clock; - - /* - * Use the opportunity that we have both locks - * taken to couple the two clocks: we take the - * larger time as the latest time for both - * runqueues. (this creates monotonic movement) - */ - if (likely((s64)(remote_clock - this_clock) < 0)) { - clock = this_clock; - scd->clock = clock; - } else { - /* - * Should be rare, but possible: - */ - clock = remote_clock; - my_scd->clock = remote_clock; - } - - __raw_spin_unlock(&my_scd->lock); - } else { - __raw_spin_lock(&scd->lock); - clock = __update_sched_clock(scd, now); - } + scd = cpu_sdc(cpu); - __raw_spin_unlock(&scd->lock); + if (cpu != smp_processor_id()) + clock = sched_clock_remote(scd); + else + clock = sched_clock_local(scd); return clock; } @@ -223,11 +209,9 @@ void sched_clock_tick(void) now_gtod = ktime_to_ns(ktime_get()); now = sched_clock(); - __raw_spin_lock(&scd->lock); scd->tick_raw = now; scd->tick_gtod = now_gtod; - __update_sched_clock(scd, now); - __raw_spin_unlock(&scd->lock); + sched_clock_local(scd); } /* diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index d014efb..0f052fc 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /* * If the cpu was currently mapped to a different value, we - * first need to unmap the old value + * need to map it to the new value then remove the old value. + * Note, we must add the new value first, otherwise we risk the + * cpu being cleared from pri_active, and this cpu could be + * missed for a push or pull. */ - if (likely(oldpri != CPUPRI_INVALID)) { - struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; - - spin_lock_irqsave(&vec->lock, flags); - - vec->count--; - if (!vec->count) - clear_bit(oldpri, cp->pri_active); - cpumask_clear_cpu(cpu, vec->mask); - - spin_unlock_irqrestore(&vec->lock, flags); - } - if (likely(newpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; @@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) spin_unlock_irqrestore(&vec->lock, flags); } + if (likely(oldpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; + + spin_lock_irqsave(&vec->lock, flags); + + vec->count--; + if (!vec->count) + clear_bit(oldpri, cp->pri_active); + cpumask_clear_cpu(cpu, vec->mask); + + spin_unlock_irqrestore(&vec->lock, flags); + } *currpri = newpri; } diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 70c7e0b..efb8440 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.sum_exec_runtime); PN(se.avg_overlap); PN(se.avg_wakeup); + PN(se.avg_running); nr_switches = p->nvcsw + p->nivcsw; @@ -409,6 +410,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.wait_max); PN(se.wait_sum); P(se.wait_count); + PN(se.iowait_sum); + P(se.iowait_count); P(sched_info.bkl_count); P(se.nr_migrations); P(se.nr_migrations_cold); @@ -479,6 +482,8 @@ void proc_sched_set_task(struct task_struct *p) p->se.wait_max = 0; p->se.wait_sum = 0; p->se.wait_count = 0; + p->se.iowait_sum = 0; + p->se.iowait_count = 0; p->se.sleep_max = 0; p->se.sum_sleep_runtime = 0; p->se.block_max = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 652e8bd..ecc637a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -24,7 +24,7 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -34,13 +34,13 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ -unsigned int sysctl_sched_latency = 20000000ULL; +unsigned int sysctl_sched_latency = 5000000ULL; /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 4000000ULL; +unsigned int sysctl_sched_min_granularity = 1000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity @@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL; static unsigned int sched_nr_latency = 5; /* - * After fork, child runs first. (default) If set to 0 then + * After fork, child runs first. If set to 0 (default) then * parent will (try to) run first. */ -const_debug unsigned int sysctl_sched_child_runs_first = 1; +unsigned int sysctl_sched_child_runs_first __read_mostly; /* * sys_sched_yield() compat mode @@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_OTHER wake-up granularity. - * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity = 5000000UL; +unsigned int sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class; * CFS operations on generic schedulable entities: */ -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* cpu runqueue to which this cfs_rq is attached */ @@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) +static inline struct task_struct *task_of(struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(!entity_is_task(se)); +#endif + return container_of(se, struct task_struct, se); +} + /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) @@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) } } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} static inline struct rq *rq_of(struct cfs_rq *cfs_rq) { @@ -505,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); + trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); cpuacct_charge(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } @@ -537,6 +546,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) schedstat_set(se->wait_count, se->wait_count + 1); schedstat_set(se->wait_sum, se->wait_sum + rq_of(cfs_rq)->clock - se->wait_start); +#ifdef CONFIG_SCHEDSTATS + if (entity_is_task(se)) { + trace_sched_stat_wait(task_of(se), + rq_of(cfs_rq)->clock - se->wait_start); + } +#endif schedstat_set(se->wait_start, 0); } @@ -628,8 +643,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sleep_start = 0; se->sum_sleep_runtime += delta; - if (tsk) + if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); + trace_sched_stat_sleep(tsk, delta); + } } if (se->block_start) { u64 delta = rq_of(cfs_rq)->clock - se->block_start; @@ -644,6 +661,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sum_sleep_runtime += delta; if (tsk) { + if (tsk->in_iowait) { + se->iowait_sum += delta; + se->iowait_count++; + trace_sched_stat_iowait(tsk, delta); + } + /* * Blocking time is in units of nanosecs, so shift by * 20 to get a milliseconds-range estimation of the @@ -687,29 +710,33 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (initial && sched_feat(START_DEBIT)) vruntime += sched_vslice(cfs_rq, se); - if (!initial) { - /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) { - unsigned long thresh = sysctl_sched_latency; + /* sleeps up to a single latency don't count. */ + if (!initial && sched_feat(FAIR_SLEEPERS)) { + unsigned long thresh = sysctl_sched_latency; - /* - * Convert the sleeper threshold into virtual time. - * SCHED_IDLE is a special sub-class. We care about - * fairness only relative to other SCHED_IDLE tasks, - * all of which have the same weight. - */ - if (sched_feat(NORMALIZED_SLEEPER) && - (!entity_is_task(se) || - task_of(se)->policy != SCHED_IDLE)) - thresh = calc_delta_fair(thresh, se); + /* + * Convert the sleeper threshold into virtual time. + * SCHED_IDLE is a special sub-class. We care about + * fairness only relative to other SCHED_IDLE tasks, + * all of which have the same weight. + */ + if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) || + task_of(se)->policy != SCHED_IDLE)) + thresh = calc_delta_fair(thresh, se); - vruntime -= thresh; - } + /* + * Halve their sleep time's effect, to allow + * for a gentler effect of sleepers: + */ + if (sched_feat(GENTLE_FAIR_SLEEPERS)) + thresh >>= 1; - /* ensure we never gain time by being placed backwards. */ - vruntime = max_vruntime(se->vruntime, vruntime); + vruntime -= thresh; } + /* ensure we never gain time by being placed backwards. */ + vruntime = max_vruntime(se->vruntime, vruntime); + se->vruntime = vruntime; } @@ -735,10 +762,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->last == se) + if (!se || cfs_rq->last == se) cfs_rq->last = NULL; - if (cfs_rq->next == se) + if (!se || cfs_rq->next == se) cfs_rq->next = NULL; } @@ -1040,79 +1067,6 @@ static void yield_task_fair(struct rq *rq) se->vruntime = rightmost->vruntime + 1; } -/* - * wake_idle() will wake a task on an idle cpu if task->cpu is - * not idle and an idle cpu is available. The span of cpus to - * search starts with cpus closest then further out as needed, - * so we always favor a closer, idle cpu. - * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_mask) - * - * Returns the CPU we should wake onto. - */ -#if defined(ARCH_HAS_SCHED_WAKE_IDLE) -static int wake_idle(int cpu, struct task_struct *p) -{ - struct sched_domain *sd; - int i; - unsigned int chosen_wakeup_cpu; - int this_cpu; - - /* - * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu - * are idle and this is not a kernel thread and this task's affinity - * allows it to be moved to preferred cpu, then just move! - */ - - this_cpu = smp_processor_id(); - chosen_wakeup_cpu = - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; - - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && - idle_cpu(cpu) && idle_cpu(this_cpu) && - p->mm && !(p->flags & PF_KTHREAD) && - cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) - return chosen_wakeup_cpu; - - /* - * If it is idle, then it is the best cpu to run this task. - * - * This cpu is also the best, if it has more than one task already. - * Siblings must be also busy(in most cases) as they didn't already - * pickup the extra load from this cpu and hence we need not check - * sibling runqueue info. This will avoid the checks and cache miss - * penalities associated with that. - */ - if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) - return cpu; - - for_each_domain(cpu, sd) { - if ((sd->flags & SD_WAKE_IDLE) - || ((sd->flags & SD_WAKE_IDLE_FAR) - && !task_hot(p, task_rq(p)->clock, sd))) { - for_each_cpu_and(i, sched_domain_span(sd), - &p->cpus_allowed) { - if (cpu_active(i) && idle_cpu(i)) { - if (i != task_cpu(p)) { - schedstat_inc(p, - se.nr_wakeups_idle); - } - return i; - } - } - } else { - break; - } - } - return cpu; -} -#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ -static inline int wake_idle(int cpu, struct task_struct *p) -{ - return cpu; -} -#endif - #ifdef CONFIG_SMP #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1199,25 +1153,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif -static int -wake_affine(struct sched_domain *this_sd, struct rq *this_rq, - struct task_struct *p, int prev_cpu, int this_cpu, int sync, - int idx, unsigned long load, unsigned long this_load, - unsigned int imbalance) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { - struct task_struct *curr = this_rq->curr; - struct task_group *tg; - unsigned long tl = this_load; + struct task_struct *curr = current; + unsigned long this_load, load; + int idx, this_cpu, prev_cpu; unsigned long tl_per_task; + unsigned int imbalance; + struct task_group *tg; unsigned long weight; int balanced; - if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) - return 0; + idx = sd->wake_idx; + this_cpu = smp_processor_id(); + prev_cpu = task_cpu(p); + load = source_load(prev_cpu, idx); + this_load = target_load(this_cpu, idx); - if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || - p->se.avg_overlap > sysctl_sched_migration_cost)) - sync = 0; + if (sync) { + if (sched_feat(SYNC_LESS) && + (curr->se.avg_overlap > sysctl_sched_migration_cost || + p->se.avg_overlap > sysctl_sched_migration_cost)) + sync = 0; + } else { + if (sched_feat(SYNC_MORE) && + (curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost)) + sync = 1; + } /* * If sync wakeup then subtract the (maximum possible) @@ -1228,14 +1191,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, tg = task_group(current); weight = current->se.load.weight; - tl += effective_load(tg, this_cpu, -weight, -weight); + this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); weight = p->se.load.weight; - balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= + imbalance = 100 + (sd->imbalance_pct - 100) / 2; + + /* + * In low-load situations, where prev_cpu is idle and this_cpu is idle + * due to the sync cause above having dropped this_load to 0, we'll + * always have an imbalance, but there's really nothing you can do + * about that, so that's good too. + * + * Otherwise check if either cpus are near enough in load to allow this + * task to be woken on this_cpu. + */ + balanced = !this_load || + 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); /* @@ -1249,14 +1224,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= - tl_per_task)) { + if (balanced || + (this_load <= load && + this_load + target_load(prev_cpu, idx) <= tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and * there is no bad imbalance. */ - schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(sd, ttwu_move_affine); schedstat_inc(p, se.nr_wakeups_affine); return 1; @@ -1264,67 +1240,216 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, return 0; } -static int select_task_rq_fair(struct task_struct *p, int sync) +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int load_idx) { - struct sched_domain *sd, *this_sd = NULL; - int prev_cpu, this_cpu, new_cpu; - unsigned long load, this_load; - struct rq *this_rq; - unsigned int imbalance; - int idx; + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int imbalance = 100 + (sd->imbalance_pct-100)/2; - prev_cpu = task_cpu(p); - this_cpu = smp_processor_id(); - this_rq = cpu_rq(this_cpu); - new_cpu = prev_cpu; + do { + unsigned long load, avg_load; + int local_group; + int i; - if (prev_cpu == this_cpu) - goto out; - /* - * 'this_sd' is the first domain that both - * this_cpu and prev_cpu are present in: - */ - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { - this_sd = sd; - break; + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu(i, sched_group_cpus(group)) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + + avg_load += load; + } + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + } while (group = group->next, group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; +} + +/* + * find_idlest_cpu - find the idlest cpu among the cpus in group. + */ +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +{ + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + /* Traverse only the allowed CPUs */ + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + load = weighted_cpuload(i); + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; } } - if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) - goto out; + return idlest; +} - /* - * Check for affine wakeup and passive balancing possibilities. - */ - if (!this_sd) +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +{ + struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int new_cpu = cpu; + int want_affine = 0; + int want_sd = 1; + int sync = wake_flags & WF_SYNC; + + if (sd_flag & SD_BALANCE_WAKE) { + if (sched_feat(AFFINE_WAKEUPS) && + cpumask_test_cpu(cpu, &p->cpus_allowed)) + want_affine = 1; + new_cpu = prev_cpu; + } + + rcu_read_lock(); + for_each_domain(cpu, tmp) { + /* + * If power savings logic is enabled for a domain, see if we + * are not overloaded, if so, don't balance wider. + */ + if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { + unsigned long power = 0; + unsigned long nr_running = 0; + unsigned long capacity; + int i; + + for_each_cpu(i, sched_domain_span(tmp)) { + power += power_of(i); + nr_running += cpu_rq(i)->cfs.nr_running; + } + + capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + + if (tmp->flags & SD_POWERSAVINGS_BALANCE) + nr_running /= 2; + + if (nr_running < capacity) + want_sd = 0; + } + + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + + affine_sd = tmp; + want_affine = 0; + } + + if (!want_sd && !want_affine) + break; + + if (!(tmp->flags & sd_flag)) + continue; + + if (want_sd) + sd = tmp; + } + + if (sched_feat(LB_SHARES_UPDATE)) { + /* + * Pick the largest domain to update shares over + */ + tmp = sd; + if (affine_sd && (!tmp || + cpumask_weight(sched_domain_span(affine_sd)) > + cpumask_weight(sched_domain_span(sd)))) + tmp = affine_sd; + + if (tmp) + update_shares(tmp); + } + + if (affine_sd && wake_affine(affine_sd, p, sync)) { + new_cpu = cpu; goto out; + } + + while (sd) { + int load_idx = sd->forkexec_idx; + struct sched_group *group; + int weight; - idx = this_sd->wake_idx; + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + if (sd_flag & SD_BALANCE_WAKE) + load_idx = sd->wake_idx; - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); + group = find_idlest_group(sd, p, cpu, load_idx); + if (!group) { + sd = sd->child; + continue; + } - if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, - load, this_load, imbalance)) - return this_cpu; + new_cpu = find_idlest_cpu(group, p, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - return this_cpu; + /* Now try balancing at a lower domain level of new_cpu */ + cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= cpumask_weight(sched_domain_span(tmp))) + break; + if (tmp->flags & sd_flag) + sd = tmp; } + /* while loop will break here if sd == NULL */ } out: - return wake_idle(new_cpu, p); + rcu_read_unlock(); + return new_cpu; } #endif /* CONFIG_SMP */ @@ -1437,11 +1562,12 @@ static void set_next_buddy(struct sched_entity *se) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); + int sync = wake_flags & WF_SYNC; update_curr(cfs_rq); @@ -1467,7 +1593,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) */ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) set_last_buddy(se); - set_next_buddy(pse); + if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) + set_next_buddy(pse); /* * We can come here with TIF_NEED_RESCHED already set from new task @@ -1489,16 +1616,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) return; } - if (!sched_feat(WAKEUP_PREEMPT)) - return; - - if (sched_feat(WAKEUP_OVERLAP) && (sync || - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { + if ((sched_feat(WAKEUP_SYNC) && sync) || + (sched_feat(WAKEUP_OVERLAP) && + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { resched_task(curr); return; } + if (sched_feat(WAKEUP_RUNNING)) { + if (pse->avg_running < se->avg_running) { + set_next_buddy(pse); + resched_task(curr); + return; + } + } + + if (!sched_feat(WAKEUP_PREEMPT)) + return; + find_matching_se(&se, &pse); BUG_ON(!pse); @@ -1521,8 +1657,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) /* * If se was a buddy, clear it so that it will have to earn * the favour again. + * + * If se was not a buddy, clear the buddies because neither + * was elegible to run, let them earn it again. + * + * IOW. unconditionally clear buddies. */ - __clear_buddies(cfs_rq, se); + __clear_buddies(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -1721,6 +1862,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) sched_info_queued(p); update_curr(cfs_rq); + if (curr) + se->vruntime = curr->vruntime; place_entity(cfs_rq, se, 1); /* 'curr' will be NULL if the child belongs to a different group */ @@ -1796,6 +1939,25 @@ static void moved_group_fair(struct task_struct *p) } #endif +unsigned int get_rr_interval_fair(struct task_struct *task) +{ + struct sched_entity *se = &task->se; + unsigned long flags; + struct rq *rq; + unsigned int rr_interval = 0; + + /* + * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise + * idle runqueue: + */ + rq = task_rq_lock(task, &flags); + if (rq->cfs.load.weight) + rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); + task_rq_unlock(rq, &flags); + + return rr_interval; +} + /* * All the scheduling class methods: */ @@ -1824,6 +1986,8 @@ static const struct sched_class fair_sched_class = { .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, + .get_rr_interval = get_rr_interval_fair, + #ifdef CONFIG_FAIR_GROUP_SCHED .moved_group = moved_group_fair, #endif diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 4569bfa..0d94083 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,17 +1,123 @@ -SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) +/* + * Disregards a certain amount of sleep time (sched_latency_ns) and + * considers the task to be running during that period. This gives it + * a service deficit on wakeup, allowing it to run sooner. + */ +SCHED_FEAT(FAIR_SLEEPERS, 1) + +/* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) + +/* + * By not normalizing the sleep time, heavy tasks get an effective + * longer period, and lighter task an effective shorter period they + * are considered running. + */ SCHED_FEAT(NORMALIZED_SLEEPER, 0) -SCHED_FEAT(ADAPTIVE_GRAN, 1) -SCHED_FEAT(WAKEUP_PREEMPT, 1) + +/* + * Place new tasks ahead so that they do not starve already running + * tasks + */ SCHED_FEAT(START_DEBIT, 1) + +/* + * Should wakeups try to preempt running tasks. + */ +SCHED_FEAT(WAKEUP_PREEMPT, 1) + +/* + * Compute wakeup_gran based on task behaviour, clipped to + * [0, sched_wakeup_gran_ns] + */ +SCHED_FEAT(ADAPTIVE_GRAN, 1) + +/* + * When converting the wakeup granularity to virtual time, do it such + * that heavier tasks preempting a lighter task have an edge. + */ +SCHED_FEAT(ASYM_GRAN, 1) + +/* + * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS. + */ +SCHED_FEAT(WAKEUP_SYNC, 0) + +/* + * Wakeup preempt based on task behaviour. Tasks that do not overlap + * don't get preempted. + */ +SCHED_FEAT(WAKEUP_OVERLAP, 0) + +/* + * Wakeup preemption towards tasks that run short + */ +SCHED_FEAT(WAKEUP_RUNNING, 0) + +/* + * Use the SYNC wakeup hint, pipes and the likes use this to indicate + * the remote end is likely to consume the data we just wrote, and + * therefore has cache benefit from being placed on the same cpu, see + * also AFFINE_WAKEUPS. + */ +SCHED_FEAT(SYNC_WAKEUPS, 1) + +/* + * Based on load and program behaviour, see if it makes sense to place + * a newly woken task on the same cpu as the task that woke it -- + * improve cache locality. Typically used with SYNC wakeups as + * generated by pipes and the like, see also SYNC_WAKEUPS. + */ SCHED_FEAT(AFFINE_WAKEUPS, 1) + +/* + * Weaken SYNC hint based on overlap + */ +SCHED_FEAT(SYNC_LESS, 1) + +/* + * Add SYNC hint based on overlap + */ +SCHED_FEAT(SYNC_MORE, 0) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, 0) + +/* + * Prefer to schedule the task that ran last (when we did + * wake-preempt) as that likely will touch the same data, increases + * cache locality. + */ +SCHED_FEAT(LAST_BUDDY, 1) + +/* + * Consider buddies to be cache hot, decreases the likelyness of a + * cache buddy being migrated away, increases cache locality. + */ SCHED_FEAT(CACHE_HOT_BUDDY, 1) -SCHED_FEAT(SYNC_WAKEUPS, 1) + +/* + * Use arch dependent cpu power functions + */ +SCHED_FEAT(ARCH_POWER, 0) + SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) -SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) -SCHED_FEAT(LB_WAKEUP_UPDATE, 1) +SCHED_FEAT(LB_SHARES_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) -SCHED_FEAT(WAKEUP_OVERLAP, 0) -SCHED_FEAT(LAST_BUDDY, 1) + +/* + * Spin-wait on mutex acquisition when the mutex owner is running on + * another cpu -- assumes that when the owner is running, it will soon + * release the lock. Decreases scheduling overhead. + */ SCHED_FEAT(OWNER_SPIN, 1) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 499672c..b133a28 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -6,7 +6,7 @@ */ #ifdef CONFIG_SMP -static int select_task_rq_idle(struct task_struct *p, int sync) +static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) { resched_task(rq->idle); } @@ -97,6 +97,11 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, check_preempt_curr(rq, p, 0); } +unsigned int get_rr_interval_idle(struct task_struct *task) +{ + return 0; +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -122,6 +127,8 @@ static const struct sched_class idle_sched_class = { .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, + .get_rr_interval = get_rr_interval_idle, + .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3918e01..a4d790c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -3,15 +3,18 @@ * policies) */ +#ifdef CONFIG_RT_GROUP_SCHED + +#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) + static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) { +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(!rt_entity_is_task(rt_se)); +#endif return container_of(rt_se, struct task_struct, rt); } -#ifdef CONFIG_RT_GROUP_SCHED - -#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) - static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { return rt_rq->rq; @@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) #define rt_entity_is_task(rt_se) (1) +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +{ + return container_of(rt_se, struct task_struct, rt); +} + static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { return container_of(rt_rq, struct rq, rt); @@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + #else static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) @@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); + sched_rt_avg_update(rq, delta_exec); + if (!rt_bandwidth_enabled()) return; @@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); - - inc_cpu_load(rq, p->se.load.weight); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) dequeue_rt_entity(rt_se); dequeue_pushable_task(rq, p); - - dec_cpu_load(rq, p->se.load.weight); } /* @@ -927,10 +938,13 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); -static int select_task_rq_rt(struct task_struct *p, int sync) +static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) { struct rq *rq = task_rq(p); + if (sd_flag != SD_BALANCE_WAKE) + return smp_processor_id(); + /* * If the current task is an RT task, then * try to see if we can wake this RT task up on another @@ -988,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); @@ -1064,6 +1078,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) if (p) dequeue_pushable_task(rq, p); +#ifdef CONFIG_SMP + /* + * We detect this state here so that we can avoid taking the RQ + * lock again later if there is no need to push + */ + rq->post_schedule = has_pushable_tasks(rq); +#endif + return p; } @@ -1162,13 +1184,6 @@ static int find_lowest_rq(struct task_struct *task) return -1; /* No targets found */ /* - * Only consider CPUs that are usable for migration. - * I guess we might want to change cpupri_find() to ignore those - * in the first place. - */ - cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); - - /* * At this point we have built a mask of cpus representing the * lowest priority tasks in the system. Now we want to elect * the best one based on our affinity and topology. @@ -1262,11 +1277,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } -static inline int has_pushable_tasks(struct rq *rq) -{ - return !plist_head_empty(&rq->rt.pushable_tasks); -} - static struct task_struct *pick_next_pushable_task(struct rq *rq) { struct task_struct *p; @@ -1466,23 +1476,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) pull_rt_task(rq); } -/* - * assumes rq->lock is held - */ -static int needs_post_schedule_rt(struct rq *rq) -{ - return has_pushable_tasks(rq); -} - static void post_schedule_rt(struct rq *rq) { - /* - * This is only called if needs_post_schedule_rt() indicates that - * we need to push tasks away - */ - spin_lock_irq(&rq->lock); push_rt_tasks(rq); - spin_unlock_irq(&rq->lock); } /* @@ -1738,6 +1734,17 @@ static void set_curr_task_rt(struct rq *rq) dequeue_pushable_task(rq, p); } +unsigned int get_rr_interval_rt(struct task_struct *task) +{ + /* + * Time slice is 0 for SCHED_FIFO tasks + */ + if (task->policy == SCHED_RR) + return DEF_TIMESLICE; + else + return 0; +} + static const struct sched_class rt_sched_class = { .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, @@ -1758,7 +1765,6 @@ static const struct sched_class rt_sched_class = { .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, - .needs_post_schedule = needs_post_schedule_rt, .post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, .switched_from = switched_from_rt, @@ -1767,6 +1773,8 @@ static const struct sched_class rt_sched_class = { .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, + .get_rr_interval = get_rr_interval_rt, + .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, }; diff --git a/kernel/smp.c b/kernel/smp.c index 94188b8..fd47a25 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -29,8 +29,7 @@ enum { struct call_function_data { struct call_single_data csd; - spinlock_t lock; - unsigned int refs; + atomic_t refs; cpumask_var_t cpumask; }; @@ -39,9 +38,7 @@ struct call_single_queue { spinlock_t lock; }; -static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { - .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock), -}; +static DEFINE_PER_CPU(struct call_function_data, cfd_data); static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -177,6 +174,11 @@ void generic_smp_call_function_interrupt(void) int cpu = get_cpu(); /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(cpu)); + + /* * Ensure entry is visible on call_function_queue after we have * entered the IPI. See comment in smp_call_function_many. * If we don't have this, then we may miss an entry on the list @@ -191,25 +193,18 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; - spin_lock(&data->lock); - if (!cpumask_test_cpu(cpu, data->cpumask)) { - spin_unlock(&data->lock); + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) continue; - } - cpumask_clear_cpu(cpu, data->cpumask); - spin_unlock(&data->lock); data->csd.func(data->csd.info); - spin_lock(&data->lock); - WARN_ON(data->refs == 0); - refs = --data->refs; + refs = atomic_dec_return(&data->refs); + WARN_ON(refs < 0); if (!refs) { spin_lock(&call_function.lock); list_del_rcu(&data->csd.list); spin_unlock(&call_function.lock); } - spin_unlock(&data->lock); if (refs) continue; @@ -230,6 +225,11 @@ void generic_smp_call_function_single_interrupt(void) unsigned int data_flags; LIST_HEAD(list); + /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(smp_processor_id())); + spin_lock(&q->lock); list_replace_init(&q->list, &list); spin_unlock(&q->lock); @@ -285,8 +285,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, */ this_cpu = get_cpu(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); if (cpu == this_cpu) { local_irq_save(flags); @@ -329,8 +335,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, { csd_lock(data); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() + && !oops_in_progress); generic_exec_single(cpu, data, wait); } @@ -365,8 +377,14 @@ void smp_call_function_many(const struct cpumask *mask, unsigned long flags; int cpu, next_cpu, this_cpu = smp_processor_id(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); @@ -391,23 +409,20 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); - spin_lock_irqsave(&data->lock, flags); data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); - data->refs = cpumask_weight(data->cpumask); + atomic_set(&data->refs, cpumask_weight(data->cpumask)); - spin_lock(&call_function.lock); + spin_lock_irqsave(&call_function.lock, flags); /* * Place entry at the _HEAD_ of the list, so that any cpu still * observing the entry in generic_smp_call_function_interrupt() * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); - spin_unlock(&call_function.lock); - - spin_unlock_irqrestore(&data->lock, flags); + spin_unlock_irqrestore(&call_function.lock, flags); /* * Make the list addition visible before sending the ipi. diff --git a/kernel/softirq.c b/kernel/softirq.c index eb5e131..f8749e5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { - "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; @@ -227,7 +227,7 @@ restart: preempt_count() = prev_count; } - rcu_bh_qsctr_inc(cpu); + rcu_bh_qs(cpu); } h++; pending >>= 1; @@ -721,7 +721,7 @@ static int ksoftirqd(void * __bind_cpu) preempt_enable_no_resched(); cond_resched(); preempt_disable(); - rcu_qsctr_inc((long)__bind_cpu); + rcu_sched_qs((long)__bind_cpu); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 7932653..5ddab73 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -21,44 +21,29 @@ #include <linux/debug_locks.h> #include <linux/module.h> +#ifndef _spin_trylock int __lockfunc _spin_trylock(spinlock_t *lock) { - preempt_disable(); - if (_raw_spin_trylock(lock)) { - spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - - preempt_enable(); - return 0; + return __spin_trylock(lock); } EXPORT_SYMBOL(_spin_trylock); +#endif +#ifndef _read_trylock int __lockfunc _read_trylock(rwlock_t *lock) { - preempt_disable(); - if (_raw_read_trylock(lock)) { - rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - - preempt_enable(); - return 0; + return __read_trylock(lock); } EXPORT_SYMBOL(_read_trylock); +#endif +#ifndef _write_trylock int __lockfunc _write_trylock(rwlock_t *lock) { - preempt_disable(); - if (_raw_write_trylock(lock)) { - rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - - preempt_enable(); - return 0; + return __write_trylock(lock); } EXPORT_SYMBOL(_write_trylock); +#endif /* * If lockdep is enabled then we use the non-preemption spin-ops @@ -67,132 +52,101 @@ EXPORT_SYMBOL(_write_trylock); */ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) +#ifndef _read_lock void __lockfunc _read_lock(rwlock_t *lock) { - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); + __read_lock(lock); } EXPORT_SYMBOL(_read_lock); +#endif +#ifndef _spin_lock_irqsave unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) { - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - /* - * On lockdep we dont want the hand-coded irq-enable of - * _raw_spin_lock_flags() code, because lockdep assumes - * that interrupts are not re-enabled during lock-acquire: - */ -#ifdef CONFIG_LOCKDEP - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -#else - _raw_spin_lock_flags(lock, &flags); -#endif - return flags; + return __spin_lock_irqsave(lock); } EXPORT_SYMBOL(_spin_lock_irqsave); +#endif +#ifndef _spin_lock_irq void __lockfunc _spin_lock_irq(spinlock_t *lock) { - local_irq_disable(); - preempt_disable(); - spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + __spin_lock_irq(lock); } EXPORT_SYMBOL(_spin_lock_irq); +#endif +#ifndef _spin_lock_bh void __lockfunc _spin_lock_bh(spinlock_t *lock) { - local_bh_disable(); - preempt_disable(); - spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + __spin_lock_bh(lock); } EXPORT_SYMBOL(_spin_lock_bh); +#endif +#ifndef _read_lock_irqsave unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) { - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock, - _raw_read_lock_flags, &flags); - return flags; + return __read_lock_irqsave(lock); } EXPORT_SYMBOL(_read_lock_irqsave); +#endif +#ifndef _read_lock_irq void __lockfunc _read_lock_irq(rwlock_t *lock) { - local_irq_disable(); - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); + __read_lock_irq(lock); } EXPORT_SYMBOL(_read_lock_irq); +#endif +#ifndef _read_lock_bh void __lockfunc _read_lock_bh(rwlock_t *lock) { - local_bh_disable(); - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); + __read_lock_bh(lock); } EXPORT_SYMBOL(_read_lock_bh); +#endif +#ifndef _write_lock_irqsave unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) { - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock, - _raw_write_lock_flags, &flags); - return flags; + return __write_lock_irqsave(lock); } EXPORT_SYMBOL(_write_lock_irqsave); +#endif +#ifndef _write_lock_irq void __lockfunc _write_lock_irq(rwlock_t *lock) { - local_irq_disable(); - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); + __write_lock_irq(lock); } EXPORT_SYMBOL(_write_lock_irq); +#endif +#ifndef _write_lock_bh void __lockfunc _write_lock_bh(rwlock_t *lock) { - local_bh_disable(); - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); + __write_lock_bh(lock); } EXPORT_SYMBOL(_write_lock_bh); +#endif +#ifndef _spin_lock void __lockfunc _spin_lock(spinlock_t *lock) { - preempt_disable(); - spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + __spin_lock(lock); } - EXPORT_SYMBOL(_spin_lock); +#endif +#ifndef _write_lock void __lockfunc _write_lock(rwlock_t *lock) { - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); + __write_lock(lock); } - EXPORT_SYMBOL(_write_lock); +#endif #else /* CONFIG_PREEMPT: */ @@ -318,125 +272,109 @@ EXPORT_SYMBOL(_spin_lock_nest_lock); #endif +#ifndef _spin_unlock void __lockfunc _spin_unlock(spinlock_t *lock) { - spin_release(&lock->dep_map, 1, _RET_IP_); - _raw_spin_unlock(lock); - preempt_enable(); + __spin_unlock(lock); } EXPORT_SYMBOL(_spin_unlock); +#endif +#ifndef _write_unlock void __lockfunc _write_unlock(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - preempt_enable(); + __write_unlock(lock); } EXPORT_SYMBOL(_write_unlock); +#endif +#ifndef _read_unlock void __lockfunc _read_unlock(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - preempt_enable(); + __read_unlock(lock); } EXPORT_SYMBOL(_read_unlock); +#endif +#ifndef _spin_unlock_irqrestore void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { - spin_release(&lock->dep_map, 1, _RET_IP_); - _raw_spin_unlock(lock); - local_irq_restore(flags); - preempt_enable(); + __spin_unlock_irqrestore(lock, flags); } EXPORT_SYMBOL(_spin_unlock_irqrestore); +#endif +#ifndef _spin_unlock_irq void __lockfunc _spin_unlock_irq(spinlock_t *lock) { - spin_release(&lock->dep_map, 1, _RET_IP_); - _raw_spin_unlock(lock); - local_irq_enable(); - preempt_enable(); + __spin_unlock_irq(lock); } EXPORT_SYMBOL(_spin_unlock_irq); +#endif +#ifndef _spin_unlock_bh void __lockfunc _spin_unlock_bh(spinlock_t *lock) { - spin_release(&lock->dep_map, 1, _RET_IP_); - _raw_spin_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __spin_unlock_bh(lock); } EXPORT_SYMBOL(_spin_unlock_bh); +#endif +#ifndef _read_unlock_irqrestore void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - local_irq_restore(flags); - preempt_enable(); + __read_unlock_irqrestore(lock, flags); } EXPORT_SYMBOL(_read_unlock_irqrestore); +#endif +#ifndef _read_unlock_irq void __lockfunc _read_unlock_irq(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - local_irq_enable(); - preempt_enable(); + __read_unlock_irq(lock); } EXPORT_SYMBOL(_read_unlock_irq); +#endif +#ifndef _read_unlock_bh void __lockfunc _read_unlock_bh(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __read_unlock_bh(lock); } EXPORT_SYMBOL(_read_unlock_bh); +#endif +#ifndef _write_unlock_irqrestore void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - local_irq_restore(flags); - preempt_enable(); + __write_unlock_irqrestore(lock, flags); } EXPORT_SYMBOL(_write_unlock_irqrestore); +#endif +#ifndef _write_unlock_irq void __lockfunc _write_unlock_irq(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - local_irq_enable(); - preempt_enable(); + __write_unlock_irq(lock); } EXPORT_SYMBOL(_write_unlock_irq); +#endif +#ifndef _write_unlock_bh void __lockfunc _write_unlock_bh(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __write_unlock_bh(lock); } EXPORT_SYMBOL(_write_unlock_bh); +#endif +#ifndef _spin_trylock_bh int __lockfunc _spin_trylock_bh(spinlock_t *lock) { - local_bh_disable(); - preempt_disable(); - if (_raw_spin_trylock(lock)) { - spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); - return 0; + return __spin_trylock_bh(lock); } EXPORT_SYMBOL(_spin_trylock_bh); +#endif notrace int in_lock_functions(unsigned long addr) { diff --git a/kernel/sys.c b/kernel/sys.c index b3f1097..ebcb156 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,7 +14,7 @@ #include <linux/prctl.h> #include <linux/highuid.h> #include <linux/fs.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> #include <linux/kexec.h> @@ -1338,6 +1338,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) unsigned long flags; cputime_t utime, stime; struct task_cputime cputime; + unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; @@ -1346,6 +1347,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = task_utime(current); stime = task_stime(current); accumulate_thread_rusage(p, r); + maxrss = p->signal->maxrss; goto out; } @@ -1363,6 +1365,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt = p->signal->cmaj_flt; r->ru_inblock = p->signal->cinblock; r->ru_oublock = p->signal->coublock; + maxrss = p->signal->cmaxrss; if (who == RUSAGE_CHILDREN) break; @@ -1377,6 +1380,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt += p->signal->maj_flt; r->ru_inblock += p->signal->inblock; r->ru_oublock += p->signal->oublock; + if (maxrss < p->signal->maxrss) + maxrss = p->signal->maxrss; t = p; do { accumulate_thread_rusage(t, r); @@ -1392,6 +1397,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) out: cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); + + if (who != RUSAGE_CHILDREN) { + struct mm_struct *mm = get_task_mm(p); + if (mm) { + setmax_mm_hiwater_rss(&maxrss, mm); + mmput(mm); + } + } + r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ } int getrusage(struct task_struct *p, int who, struct rusage __user *ru) @@ -1511,11 +1525,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; - case PR_TASK_PERF_COUNTERS_DISABLE: - error = perf_counter_task_disable(); + case PR_TASK_PERF_EVENTS_DISABLE: + error = perf_event_task_disable(); break; - case PR_TASK_PERF_COUNTERS_ENABLE: - error = perf_counter_task_enable(); + case PR_TASK_PERF_EVENTS_ENABLE: + error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: error = current->timer_slack_ns; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 68320f6..515bc23 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -177,4 +177,4 @@ cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); /* performance counters: */ -cond_syscall(sys_perf_counter_open); +cond_syscall(sys_perf_event_open); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 58be760..0dfaa47 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,9 +49,8 @@ #include <linux/acpi.h> #include <linux/reboot.h> #include <linux/ftrace.h> -#include <linux/security.h> #include <linux/slow-work.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -92,6 +91,9 @@ extern int sysctl_nr_trim_pages; #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +#ifdef CONFIG_BLOCK +extern int blk_iopoll_enabled; +#endif /* Constants used for minimum and maximum */ #ifdef CONFIG_DETECT_SOFTLOCKUP @@ -104,6 +106,9 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; +#ifdef CONFIG_PRINTK +static int ten_thousand = 10000; +#endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -246,6 +251,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ #endif static struct ctl_table kern_table[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, @@ -300,14 +313,6 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_features", .data = &sysctl_sched_features, .maxlen = sizeof(unsigned int), @@ -332,6 +337,14 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, + .procname = "sched_time_avg", + .data = &sysctl_sched_time_avg, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, .procname = "timer_migration", .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), @@ -712,6 +725,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &ten_thousand, + }, #endif { .ctl_name = KERN_NGROUPS_MAX, @@ -954,28 +978,28 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_paranoid", - .data = &sysctl_perf_counter_paranoid, - .maxlen = sizeof(sysctl_perf_counter_paranoid), + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_mlock_kb", - .data = &sysctl_perf_counter_mlock, - .maxlen = sizeof(sysctl_perf_counter_mlock), + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_max_sample_rate", - .data = &sysctl_perf_counter_sample_rate, - .maxlen = sizeof(sysctl_perf_counter_sample_rate), + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = &proc_dointvec, }, @@ -990,7 +1014,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif - +#ifdef CONFIG_BLOCK + { + .ctl_name = CTL_UNNUMBERED, + .procname = "blk_iopoll", + .data = &blk_iopoll_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 888adbc..ea8384d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * Send taskstats data in @skb to listener with nl_pid @pid */ -static int send_reply(struct sk_buff *skb, pid_t pid) +static int send_reply(struct sk_buff *skb, struct genl_info *info) { struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); void *reply = genlmsg_data(genlhdr); @@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid) return rc; } - return genlmsg_unicast(skb, pid); + return genlmsg_reply(skb, info); } /* @@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb, if (!skb_next) break; } - rc = genlmsg_unicast(skb_cur, s->pid); + rc = genlmsg_unicast(&init_net, skb_cur, s->pid); if (rc == -ECONNREFUSED) { s->valid = 0; delcount++; @@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) goto err; } - rc = send_reply(rep_skb, info->snd_pid); + rc = send_reply(rep_skb, info); err: fput_light(file, fput_needed); @@ -487,7 +487,7 @@ free_return_rc: } else goto err; - return send_reply(rep_skb, info->snd_pid); + return send_reply(rep_skb, info); err: nlmsg_free(rep_skb); return rc; diff --git a/kernel/time.c b/kernel/time.c index 2951194..2e2e469 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime); * 0 <= tv_nsec < NSEC_PER_SEC * For negative values only the tv_sec field is negative ! */ -void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) +void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec) { while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); nsec -= NSEC_PER_SEC; ++sec; } while (nsec < 0) { + asm("" : "+rm"(nsec)); nsec += NSEC_PER_SEC; --sec; } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7466cb8..0911334 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -21,7 +21,6 @@ * * TODO WishList: * o Allow clocksource drivers to be unregistered - * o get rid of clocksource_jiffies extern */ #include <linux/clocksource.h> @@ -30,6 +29,7 @@ #include <linux/module.h> #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ #include <linux/tick.h> +#include <linux/kthread.h> void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, @@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc, } EXPORT_SYMBOL(timecounter_cyc2time); -/* XXX - Would like a better way for initializing curr_clocksource */ -extern struct clocksource clocksource_jiffies; - /*[Clocksource internal variables]--------- * curr_clocksource: - * currently selected clocksource. Initialized to clocksource_jiffies. - * next_clocksource: - * pending next selected clocksource. + * currently selected clocksource. * clocksource_list: * linked list with the registered clocksources - * clocksource_lock: - * protects manipulations to curr_clocksource and next_clocksource - * and the clocksource_list + * clocksource_mutex: + * protects manipulations to curr_clocksource and the clocksource_list * override_name: * Name of the user-specified clocksource. */ -static struct clocksource *curr_clocksource = &clocksource_jiffies; -static struct clocksource *next_clocksource; -static struct clocksource *clocksource_override; +static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); -static DEFINE_SPINLOCK(clocksource_lock); +static DEFINE_MUTEX(clocksource_mutex); static char override_name[32]; static int finished_booting; -/* clocksource_done_booting - Called near the end of core bootup - * - * Hack to avoid lots of clocksource churn at boot time. - * We use fs_initcall because we want this to start before - * device_initcall but after subsys_initcall. - */ -static int __init clocksource_done_booting(void) -{ - finished_booting = 1; - return 0; -} -fs_initcall(clocksource_done_booting); - #ifdef CONFIG_CLOCKSOURCE_WATCHDOG +static void clocksource_watchdog_work(struct work_struct *work); + static LIST_HEAD(watchdog_list); static struct clocksource *watchdog; static struct timer_list watchdog_timer; +static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; -static unsigned long watchdog_resumed; +static int watchdog_running; + +static int clocksource_watchdog_kthread(void *data); +static void __clocksource_change_rating(struct clocksource *cs, int rating); /* * Interval: 0.5sec Threshold: 0.0625s @@ -158,135 +143,249 @@ static unsigned long watchdog_resumed; #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) -static void clocksource_ratewd(struct clocksource *cs, int64_t delta) +static void clocksource_watchdog_work(struct work_struct *work) { - if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) - return; + /* + * If kthread_run fails the next watchdog scan over the + * watchdog_list will find the unstable clock again. + */ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} +static void __clocksource_unstable(struct clocksource *cs) +{ + cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); + cs->flags |= CLOCK_SOURCE_UNSTABLE; + if (finished_booting) + schedule_work(&watchdog_work); +} + +static void clocksource_unstable(struct clocksource *cs, int64_t delta) +{ printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", cs->name, delta); - cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); - clocksource_change_rating(cs, 0); - list_del(&cs->wd_list); + __clocksource_unstable(cs); +} + +/** + * clocksource_mark_unstable - mark clocksource unstable via watchdog + * @cs: clocksource to be marked unstable + * + * This function is called instead of clocksource_change_rating from + * cpu hotplug code to avoid a deadlock between the clocksource mutex + * and the cpu hotplug mutex. It defers the update of the clocksource + * to the watchdog thread. + */ +void clocksource_mark_unstable(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { + if (list_empty(&cs->wd_list)) + list_add(&cs->wd_list, &watchdog_list); + __clocksource_unstable(cs); + } + spin_unlock_irqrestore(&watchdog_lock, flags); } static void clocksource_watchdog(unsigned long data) { - struct clocksource *cs, *tmp; + struct clocksource *cs; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; - int resumed; + int next_cpu; spin_lock(&watchdog_lock); - - resumed = test_and_clear_bit(0, &watchdog_resumed); + if (!watchdog_running) + goto out; wdnow = watchdog->read(watchdog); - wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); + wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask, + watchdog->mult, watchdog->shift); watchdog_last = wdnow; - list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { - csnow = cs->read(cs); + list_for_each_entry(cs, &watchdog_list, wd_list) { - if (unlikely(resumed)) { - cs->wd_last = csnow; + /* Clocksource already marked unstable? */ + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + if (finished_booting) + schedule_work(&watchdog_work); continue; } - /* Initialized ? */ + csnow = cs->read(cs); + + /* Clocksource initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { - if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && - (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - /* - * We just marked the clocksource as - * highres-capable, notify the rest of the - * system as well so that we transition - * into high-res mode: - */ - tick_clock_notify(); - } cs->flags |= CLOCK_SOURCE_WATCHDOG; cs->wd_last = csnow; - } else { - cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); - cs->wd_last = csnow; - /* Check the delta. Might remove from the list ! */ - clocksource_ratewd(cs, cs_nsec - wd_nsec); + continue; } - } - if (!list_empty(&watchdog_list)) { - /* - * Cycle through CPUs to check if the CPUs stay - * synchronized to each other. - */ - int next_cpu = cpumask_next(raw_smp_processor_id(), - cpu_online_mask); + /* Check the deviation from the watchdog clocksource. */ + cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & + cs->mask, cs->mult, cs->shift); + cs->wd_last = csnow; + if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + clocksource_unstable(cs, cs_nsec - wd_nsec); + continue; + } - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && + (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + /* + * We just marked the clocksource as highres-capable, + * notify the rest of the system as well so that we + * transition into high-res mode: + */ + tick_clock_notify(); + } } + + /* + * Cycle through CPUs to check if the CPUs stay synchronized + * to each other. + */ + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); +out: spin_unlock(&watchdog_lock); } + +static inline void clocksource_start_watchdog(void) +{ + if (watchdog_running || !watchdog || list_empty(&watchdog_list)) + return; + init_timer(&watchdog_timer); + watchdog_timer.function = clocksource_watchdog; + watchdog_last = watchdog->read(watchdog); + watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); + watchdog_running = 1; +} + +static inline void clocksource_stop_watchdog(void) +{ + if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) + return; + del_timer(&watchdog_timer); + watchdog_running = 0; +} + +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + static void clocksource_resume_watchdog(void) { - set_bit(0, &watchdog_resumed); + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + clocksource_reset_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); } -static void clocksource_check_watchdog(struct clocksource *cs) +static void clocksource_enqueue_watchdog(struct clocksource *cs) { - struct clocksource *cse; unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - int started = !list_empty(&watchdog_list); - + /* cs is a clocksource to be watched. */ list_add(&cs->wd_list, &watchdog_list); - if (!started && watchdog) { - watchdog_last = watchdog->read(watchdog); - watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - cpumask_first(cpu_online_mask)); - } + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; } else { + /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - + /* Pick the best watchdog. */ if (!watchdog || cs->rating > watchdog->rating) { - if (watchdog) - del_timer(&watchdog_timer); watchdog = cs; - init_timer(&watchdog_timer); - watchdog_timer.function = clocksource_watchdog; - /* Reset watchdog cycles */ - list_for_each_entry(cse, &watchdog_list, wd_list) - cse->flags &= ~CLOCK_SOURCE_WATCHDOG; - /* Start if list is not empty */ - if (!list_empty(&watchdog_list)) { - watchdog_last = watchdog->read(watchdog); - watchdog_timer.expires = - jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - cpumask_first(cpu_online_mask)); - } + clocksource_reset_watchdog(); + } + } + /* Check if the watchdog timer needs to be started. */ + clocksource_start_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_dequeue_watchdog(struct clocksource *cs) +{ + struct clocksource *tmp; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + } else if (cs == watchdog) { + /* Reset watchdog cycles */ + clocksource_reset_watchdog(); + /* Current watchdog is removed. Find an alternative. */ + watchdog = NULL; + list_for_each_entry(tmp, &clocksource_list, list) { + if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + if (!watchdog || tmp->rating > watchdog->rating) + watchdog = tmp; } } + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } -#else -static void clocksource_check_watchdog(struct clocksource *cs) + +static int clocksource_watchdog_kthread(void *data) +{ + struct clocksource *cs, *tmp; + unsigned long flags; + LIST_HEAD(unstable); + + mutex_lock(&clocksource_mutex); + spin_lock_irqsave(&watchdog_lock, flags); + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + list_del_init(&cs->wd_list); + list_add(&cs->wd_list, &unstable); + } + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); + + /* Needs to be done outside of watchdog lock */ + list_for_each_entry_safe(cs, tmp, &unstable, wd_list) { + list_del_init(&cs->wd_list); + __clocksource_change_rating(cs, 0); + } + mutex_unlock(&clocksource_mutex); + return 0; +} + +#else /* CONFIG_CLOCKSOURCE_WATCHDOG */ + +static void clocksource_enqueue_watchdog(struct clocksource *cs) { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -#endif +static inline int clocksource_watchdog_kthread(void *data) { return 0; } + +#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ /** * clocksource_resume - resume the clocksource(s) @@ -294,18 +393,16 @@ static inline void clocksource_resume_watchdog(void) { } void clocksource_resume(void) { struct clocksource *cs; - unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); - list_for_each_entry(cs, &clocksource_list, list) { + list_for_each_entry(cs, &clocksource_list, list) if (cs->resume) cs->resume(); - } clocksource_resume_watchdog(); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_unlock(&clocksource_mutex); } /** @@ -320,75 +417,94 @@ void clocksource_touch_watchdog(void) clocksource_resume_watchdog(); } +#ifdef CONFIG_GENERIC_TIME + /** - * clocksource_get_next - Returns the selected clocksource + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. */ -struct clocksource *clocksource_get_next(void) +static void clocksource_select(void) { - unsigned long flags; + struct clocksource *best, *cs; - spin_lock_irqsave(&clocksource_lock, flags); - if (next_clocksource && finished_booting) { - curr_clocksource = next_clocksource; - next_clocksource = NULL; + if (!finished_booting || list_empty(&clocksource_list)) + return; + /* First clocksource on the list has the best rating. */ + best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. */ + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, override_name) != 0) + continue; + /* + * Check to make sure we don't switch to a non-highres + * capable clocksource if the tick code is in oneshot + * mode (highres or nohz) + */ + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + tick_oneshot_mode_active()) { + /* Override clocksource cannot be used. */ + printk(KERN_WARNING "Override clocksource %s is not " + "HRT compatible. Cannot switch while in " + "HRT/NOHZ mode\n", cs->name); + override_name[0] = 0; + } else + /* Override clocksource can be used. */ + best = cs; + break; + } + if (curr_clocksource != best) { + printk(KERN_INFO "Switching to clocksource %s\n", best->name); + curr_clocksource = best; + timekeeping_notify(curr_clocksource); } - spin_unlock_irqrestore(&clocksource_lock, flags); - - return curr_clocksource; } -/** - * select_clocksource - Selects the best registered clocksource. - * - * Private function. Must hold clocksource_lock when called. +#else /* CONFIG_GENERIC_TIME */ + +static inline void clocksource_select(void) { } + +#endif + +/* + * clocksource_done_booting - Called near the end of core bootup * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. + * Hack to avoid lots of clocksource churn at boot time. + * We use fs_initcall because we want this to start before + * device_initcall but after subsys_initcall. */ -static struct clocksource *select_clocksource(void) +static int __init clocksource_done_booting(void) { - struct clocksource *next; - - if (list_empty(&clocksource_list)) - return NULL; - - if (clocksource_override) - next = clocksource_override; - else - next = list_entry(clocksource_list.next, struct clocksource, - list); + finished_booting = 1; - if (next == curr_clocksource) - return NULL; + /* + * Run the watchdog first to eliminate unstable clock sources + */ + clocksource_watchdog_kthread(NULL); - return next; + mutex_lock(&clocksource_mutex); + clocksource_select(); + mutex_unlock(&clocksource_mutex); + return 0; } +fs_initcall(clocksource_done_booting); /* * Enqueue the clocksource sorted by rating */ -static int clocksource_enqueue(struct clocksource *c) +static void clocksource_enqueue(struct clocksource *cs) { - struct list_head *tmp, *entry = &clocksource_list; + struct list_head *entry = &clocksource_list; + struct clocksource *tmp; - list_for_each(tmp, &clocksource_list) { - struct clocksource *cs; - - cs = list_entry(tmp, struct clocksource, list); - if (cs == c) - return -EBUSY; + list_for_each_entry(tmp, &clocksource_list, list) /* Keep track of the place, where to insert */ - if (cs->rating >= c->rating) - entry = tmp; - } - list_add(&c->list, entry); - - if (strlen(c->name) == strlen(override_name) && - !strcmp(c->name, override_name)) - clocksource_override = c; - - return 0; + if (tmp->rating >= cs->rating) + entry = &tmp->list; + list_add(&cs->list, entry); } /** @@ -397,52 +513,48 @@ static int clocksource_enqueue(struct clocksource *c) * * Returns -EBUSY if registration fails, zero otherwise. */ -int clocksource_register(struct clocksource *c) +int clocksource_register(struct clocksource *cs) { - unsigned long flags; - int ret; - - spin_lock_irqsave(&clocksource_lock, flags); - ret = clocksource_enqueue(c); - if (!ret) - next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); - if (!ret) - clocksource_check_watchdog(c); - return ret; + mutex_lock(&clocksource_mutex); + clocksource_enqueue(cs); + clocksource_select(); + clocksource_enqueue_watchdog(cs); + mutex_unlock(&clocksource_mutex); + return 0; } EXPORT_SYMBOL(clocksource_register); +static void __clocksource_change_rating(struct clocksource *cs, int rating) +{ + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); + clocksource_select(); +} + /** * clocksource_change_rating - Change the rating of a registered clocksource - * */ void clocksource_change_rating(struct clocksource *cs, int rating) { - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); - list_del(&cs->list); - cs->rating = rating; - clocksource_enqueue(cs); - next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); + __clocksource_change_rating(cs, rating); + mutex_unlock(&clocksource_mutex); } +EXPORT_SYMBOL(clocksource_change_rating); /** * clocksource_unregister - remove a registered clocksource */ void clocksource_unregister(struct clocksource *cs) { - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); + clocksource_dequeue_watchdog(cs); list_del(&cs->list); - if (clocksource_override == cs) - clocksource_override = NULL; - next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); + clocksource_select(); + mutex_unlock(&clocksource_mutex); } +EXPORT_SYMBOL(clocksource_unregister); #ifdef CONFIG_SYSFS /** @@ -458,9 +570,9 @@ sysfs_show_current_clocksources(struct sys_device *dev, { ssize_t count = 0; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); return count; } @@ -478,9 +590,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, struct sysdev_attribute *attr, const char *buf, size_t count) { - struct clocksource *ovr = NULL; size_t ret = count; - int len; /* strings from sysfs write are not 0 terminated! */ if (count >= sizeof(override_name)) @@ -490,44 +600,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, if (buf[count-1] == '\n') count--; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); if (count > 0) memcpy(override_name, buf, count); override_name[count] = 0; + clocksource_select(); - len = strlen(override_name); - if (len) { - struct clocksource *cs; - - ovr = clocksource_override; - /* try to select it: */ - list_for_each_entry(cs, &clocksource_list, list) { - if (strlen(cs->name) == len && - !strcmp(cs->name, override_name)) - ovr = cs; - } - } - - /* - * Check to make sure we don't switch to a non-highres capable - * clocksource if the tick code is in oneshot mode (highres or nohz) - */ - if (tick_oneshot_mode_active() && ovr && - !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { - printk(KERN_WARNING "%s clocksource is not HRT compatible. " - "Cannot switch while in HRT/NOHZ mode\n", ovr->name); - ovr = NULL; - override_name[0] = 0; - } - - /* Reselect, when the override name has changed */ - if (ovr != clocksource_override) { - clocksource_override = ovr; - next_clocksource = select_clocksource(); - } - - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); return ret; } @@ -547,7 +627,7 @@ sysfs_show_available_clocksources(struct sys_device *dev, struct clocksource *src; ssize_t count = 0; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); list_for_each_entry(src, &clocksource_list, list) { /* * Don't show non-HRES clocksource if the tick code is @@ -559,7 +639,7 @@ sysfs_show_available_clocksources(struct sys_device *dev, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "%s ", src->name); } - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); count += snprintf(buf + count, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); @@ -614,11 +694,10 @@ device_initcall(init_clocksource_sysfs); */ static int __init boot_override_clocksource(char* str) { - unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); if (str) strlcpy(override_name, str, sizeof(override_name)); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index c3f6c30..5404a84 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = { .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ - .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT, .shift = JIFFIES_SHIFT, }; @@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void) } core_initcall(init_jiffies_clocksource); + +struct clocksource * __init __weak clocksource_default_clock(void) +{ + return &clocksource_jiffies; +} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7fc6437..4800f93 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) case TIME_OK: break; case TIME_INS: - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; + timekeeping_leap_insert(-1); time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); @@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) res = HRTIMER_RESTART; break; case TIME_DEL: - xtime.tv_sec++; + timekeeping_leap_insert(1); time_tai--; - wall_to_monotonic.tv_sec--; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); @@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) time_state = TIME_OK; break; } - update_vsyscall(&xtime, clock); write_sequnlock(&xtime_lock); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e8c77d9..fb0f46f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -18,7 +18,117 @@ #include <linux/jiffies.h> #include <linux/time.h> #include <linux/tick.h> +#include <linux/stop_machine.h> + +/* Structure holding internal timekeeping values. */ +struct timekeeper { + /* Current clocksource used for timekeeping. */ + struct clocksource *clock; + /* The shift value of the current clocksource. */ + int shift; + + /* Number of clock cycles in one NTP interval. */ + cycle_t cycle_interval; + /* Number of clock shifted nano seconds in one NTP interval. */ + u64 xtime_interval; + /* Raw nano seconds accumulated per NTP interval. */ + u32 raw_interval; + + /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ + u64 xtime_nsec; + /* Difference between accumulated time and NTP time in ntp + * shifted nano seconds. */ + s64 ntp_error; + /* Shift conversion between clock shifted nano seconds and + * ntp shifted nano seconds. */ + int ntp_error_shift; + /* NTP adjusted clock multiplier */ + u32 mult; +}; + +struct timekeeper timekeeper; + +/** + * timekeeper_setup_internals - Set up internals to use clocksource clock. + * + * @clock: Pointer to clocksource. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timekeeping code, you should not be using this! + */ +static void timekeeper_setup_internals(struct clocksource *clock) +{ + cycle_t interval; + u64 tmp; + + timekeeper.clock = clock; + clock->cycle_last = clock->read(clock); + /* Do the ns -> cycle conversion first, using original mult */ + tmp = NTP_INTERVAL_LENGTH; + tmp <<= clock->shift; + tmp += clock->mult/2; + do_div(tmp, clock->mult); + if (tmp == 0) + tmp = 1; + + interval = (cycle_t) tmp; + timekeeper.cycle_interval = interval; + + /* Go back from cycles -> shifted ns */ + timekeeper.xtime_interval = (u64) interval * clock->mult; + timekeeper.raw_interval = + ((u64) interval * clock->mult) >> clock->shift; + + timekeeper.xtime_nsec = 0; + timekeeper.shift = clock->shift; + + timekeeper.ntp_error = 0; + timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + + /* + * The timekeeper keeps its own mult values for the currently + * active clocksource. These value will be adjusted via NTP + * to counteract clock drifting. + */ + timekeeper.mult = clock->mult; +} + +/* Timekeeper helper functions. */ +static inline s64 timekeeping_get_ns(void) +{ + cycle_t cycle_now, cycle_delta; + struct clocksource *clock; + + /* read clocksource: */ + clock = timekeeper.clock; + cycle_now = clock->read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* return delta convert to nanoseconds using ntp adjusted mult. */ + return clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); +} + +static inline s64 timekeeping_get_ns_raw(void) +{ + cycle_t cycle_now, cycle_delta; + struct clocksource *clock; + + /* read clocksource: */ + clock = timekeeper.clock; + cycle_now = clock->read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* return delta convert to nanoseconds using ntp adjusted mult. */ + return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); +} /* * This read-write spinlock protects us from races in SMP while @@ -44,7 +154,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); */ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static unsigned long total_sleep_time; /* seconds */ +static struct timespec total_sleep_time; + +/* + * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. + */ +struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -56,35 +171,44 @@ void update_xtime_cache(u64 nsec) timespec_add_ns(&xtime_cache, nsec); } -struct clocksource *clock; - +/* must hold xtime_lock */ +void timekeeping_leap_insert(int leapsecond) +{ + xtime.tv_sec += leapsecond; + wall_to_monotonic.tv_sec -= leapsecond; + update_vsyscall(&xtime, timekeeper.clock); +} #ifdef CONFIG_GENERIC_TIME + /** - * clocksource_forward_now - update clock to the current time + * timekeeping_forward_now - update clock to the current time * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ -static void clocksource_forward_now(void) +static void timekeeping_forward_now(void) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; s64 nsec; - cycle_now = clocksource_read(clock); + clock = timekeeper.clock; + cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - nsec = cyc2ns(clock, cycle_delta); + nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); /* If arch requires, add in gettimeoffset() */ nsec += arch_gettimeoffset(); timespec_add_ns(&xtime, nsec); - nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; - clock->raw_time.tv_nsec += nsec; + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + timespec_add_ns(&raw_time, nsec); } /** @@ -95,7 +219,6 @@ static void clocksource_forward_now(void) */ void getnstimeofday(struct timespec *ts) { - cycle_t cycle_now, cycle_delta; unsigned long seq; s64 nsecs; @@ -105,15 +228,7 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&xtime_lock); *ts = xtime; - - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - nsecs = cyc2ns(clock, cycle_delta); + nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -125,6 +240,57 @@ void getnstimeofday(struct timespec *ts) EXPORT_SYMBOL(getnstimeofday); +ktime_t ktime_get(void) +{ + unsigned int seq; + s64 secs, nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + secs = xtime.tv_sec + wall_to_monotonic.tv_sec; + nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; + nsecs += timekeeping_get_ns(); + + } while (read_seqretry(&xtime_lock, seq)); + /* + * Use ktime_set/ktime_add_ns to create a proper ktime on + * 32-bit architectures without CONFIG_KTIME_SCALAR. + */ + return ktime_add_ns(ktime_set(secs, 0), nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get); + +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. + */ +void ktime_get_ts(struct timespec *ts) +{ + struct timespec tomono; + unsigned int seq; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + *ts = xtime; + tomono = wall_to_monotonic; + nsecs = timekeeping_get_ns(); + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec + nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_ts); + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set @@ -157,7 +323,7 @@ int do_settimeofday(struct timespec *tv) write_seqlock_irqsave(&xtime_lock, flags); - clocksource_forward_now(); + timekeeping_forward_now(); ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; @@ -167,10 +333,10 @@ int do_settimeofday(struct timespec *tv) update_xtime_cache(0); - clock->error = 0; + timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -187,44 +353,97 @@ EXPORT_SYMBOL(do_settimeofday); * * Accumulates current time interval and initializes new clocksource */ -static void change_clocksource(void) +static int change_clocksource(void *data) { struct clocksource *new, *old; - new = clocksource_get_next(); + new = (struct clocksource *) data; + + timekeeping_forward_now(); + if (!new->enable || new->enable(new) == 0) { + old = timekeeper.clock; + timekeeper_setup_internals(new); + if (old->disable) + old->disable(old); + } + return 0; +} - if (clock == new) +/** + * timekeeping_notify - Install a new clock source + * @clock: pointer to the clock source + * + * This function is called from clocksource.c after a new, better clock + * source has been registered. The caller holds the clocksource_mutex. + */ +void timekeeping_notify(struct clocksource *clock) +{ + if (timekeeper.clock == clock) return; + stop_machine(change_clocksource, clock, NULL); + tick_clock_notify(); +} - clocksource_forward_now(); +#else /* GENERIC_TIME */ - if (clocksource_enable(new)) - return; +static inline void timekeeping_forward_now(void) { } - new->raw_time = clock->raw_time; - old = clock; - clock = new; - clocksource_disable(old); +/** + * ktime_get - get the monotonic time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get(void) +{ + struct timespec now; - clock->cycle_last = 0; - clock->cycle_last = clocksource_read(clock); - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + ktime_get_ts(&now); - tick_clock_notify(); + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get); - /* - * We're holding xtime lock and waking up klogd would deadlock - * us on enqueue. So no printing! - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); - */ +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. + */ +void ktime_get_ts(struct timespec *ts) +{ + struct timespec tomono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + getnstimeofday(ts); + tomono = wall_to_monotonic; + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec); } -#else -static inline void clocksource_forward_now(void) { } -static inline void change_clocksource(void) { } -#endif +EXPORT_SYMBOL_GPL(ktime_get_ts); + +#endif /* !GENERIC_TIME */ + +/** + * ktime_get_real - get the real (wall-) time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get_real(void) +{ + struct timespec now; + + getnstimeofday(&now); + + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get_real); /** * getrawmonotonic - Returns the raw monotonic time in a timespec @@ -236,21 +455,11 @@ void getrawmonotonic(struct timespec *ts) { unsigned long seq; s64 nsecs; - cycle_t cycle_now, cycle_delta; do { seq = read_seqbegin(&xtime_lock); - - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; - - *ts = clock->raw_time; + nsecs = timekeeping_get_ns_raw(); + *ts = raw_time; } while (read_seqretry(&xtime_lock, seq)); @@ -270,7 +479,7 @@ int timekeeping_valid_for_hres(void) do { seq = read_seqbegin(&xtime_lock); - ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqretry(&xtime_lock, seq)); @@ -278,17 +487,33 @@ int timekeeping_valid_for_hres(void) } /** - * read_persistent_clock - Return time in seconds from the persistent clock. + * read_persistent_clock - Return time from the persistent clock. * * Weak dummy function for arches that do not yet support it. - * Returns seconds from epoch using the battery backed persistent clock. - * Returns zero if unsupported. + * Reads the time from the battery backed persistent clock. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ -unsigned long __attribute__((weak)) read_persistent_clock(void) +void __attribute__((weak)) read_persistent_clock(struct timespec *ts) { - return 0; + ts->tv_sec = 0; + ts->tv_nsec = 0; +} + +/** + * read_boot_clock - Return time of the system start. + * + * Weak dummy function for arches that do not yet support it. + * Function to read the exact time the system has been started. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +void __attribute__((weak)) read_boot_clock(struct timespec *ts) +{ + ts->tv_sec = 0; + ts->tv_nsec = 0; } /* @@ -296,29 +521,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void) */ void __init timekeeping_init(void) { + struct clocksource *clock; unsigned long flags; - unsigned long sec = read_persistent_clock(); + struct timespec now, boot; + + read_persistent_clock(&now); + read_boot_clock(&boot); write_seqlock_irqsave(&xtime_lock, flags); ntp_init(); - clock = clocksource_get_next(); - clocksource_enable(clock); - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clocksource_read(clock); - - xtime.tv_sec = sec; - xtime.tv_nsec = 0; + clock = clocksource_default_clock(); + if (clock->enable) + clock->enable(clock); + timekeeper_setup_internals(clock); + + xtime.tv_sec = now.tv_sec; + xtime.tv_nsec = now.tv_nsec; + raw_time.tv_sec = 0; + raw_time.tv_nsec = 0; + if (boot.tv_sec == 0 && boot.tv_nsec == 0) { + boot.tv_sec = xtime.tv_sec; + boot.tv_nsec = xtime.tv_nsec; + } set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); + -boot.tv_sec, -boot.tv_nsec); update_xtime_cache(0); - total_sleep_time = 0; + total_sleep_time.tv_sec = 0; + total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); } /* time in seconds when suspend began */ -static unsigned long timekeeping_suspend_time; +static struct timespec timekeeping_suspend_time; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -331,24 +567,24 @@ static unsigned long timekeeping_suspend_time; static int timekeeping_resume(struct sys_device *dev) { unsigned long flags; - unsigned long now = read_persistent_clock(); + struct timespec ts; + + read_persistent_clock(&ts); clocksource_resume(); write_seqlock_irqsave(&xtime_lock, flags); - if (now && (now > timekeeping_suspend_time)) { - unsigned long sleep_length = now - timekeeping_suspend_time; - - xtime.tv_sec += sleep_length; - wall_to_monotonic.tv_sec -= sleep_length; - total_sleep_time += sleep_length; + if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { + ts = timespec_sub(ts, timekeeping_suspend_time); + xtime = timespec_add_safe(xtime, ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); + total_sleep_time = timespec_add_safe(total_sleep_time, ts); } update_xtime_cache(0); /* re-base the last cycle value */ - clock->cycle_last = 0; - clock->cycle_last = clocksource_read(clock); - clock->error = 0; + timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); + timekeeper.ntp_error = 0; timekeeping_suspended = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -366,10 +602,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; - timekeeping_suspend_time = read_persistent_clock(); + read_persistent_clock(&timekeeping_suspend_time); write_seqlock_irqsave(&xtime_lock, flags); - clocksource_forward_now(); + timekeeping_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -404,7 +640,7 @@ device_initcall(timekeeping_init_device); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, +static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, s64 *offset) { s64 tick_error, i; @@ -420,7 +656,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ - error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); error2 = abs(error2); for (look_ahead = 0; error2 > 0; look_ahead++) error2 >>= 2; @@ -429,8 +665,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); - tick_error -= clock->xtime_interval >> 1; + tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); + tick_error -= timekeeper.xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; /* Finally calculate the adjustment shift value. */ @@ -455,18 +691,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. */ -static void clocksource_adjust(s64 offset) +static void timekeeping_adjust(s64 offset) { - s64 error, interval = clock->cycle_interval; + s64 error, interval = timekeeper.cycle_interval; int adj; - error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); + error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); if (error > interval) { error >>= 2; if (likely(error <= interval)) adj = 1; else - adj = clocksource_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(error, &interval, &offset); } else if (error < -interval) { error >>= 2; if (likely(error >= -interval)) { @@ -474,15 +710,15 @@ static void clocksource_adjust(s64 offset) interval = -interval; offset = -offset; } else - adj = clocksource_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(error, &interval, &offset); } else return; - clock->mult += adj; - clock->xtime_interval += interval; - clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << - (NTP_SCALE_SHIFT - clock->shift); + timekeeper.mult += adj; + timekeeper.xtime_interval += interval; + timekeeper.xtime_nsec -= offset; + timekeeper.ntp_error -= (interval - offset) << + timekeeper.ntp_error_shift; } /** @@ -492,53 +728,59 @@ static void clocksource_adjust(s64 offset) */ void update_wall_time(void) { + struct clocksource *clock; cycle_t offset; + u64 nsecs; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return; + clock = timekeeper.clock; #ifdef CONFIG_GENERIC_TIME - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #else - offset = clock->cycle_interval; + offset = timekeeper.cycle_interval; #endif - clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; + timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. */ - while (offset >= clock->cycle_interval) { + while (offset >= timekeeper.cycle_interval) { + u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; + /* accumulate one interval */ - offset -= clock->cycle_interval; - clock->cycle_last += clock->cycle_interval; + offset -= timekeeper.cycle_interval; + clock->cycle_last += timekeeper.cycle_interval; - clock->xtime_nsec += clock->xtime_interval; - if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { - clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; + timekeeper.xtime_nsec += timekeeper.xtime_interval; + if (timekeeper.xtime_nsec >= nsecps) { + timekeeper.xtime_nsec -= nsecps; xtime.tv_sec++; second_overflow(); } - clock->raw_time.tv_nsec += clock->raw_interval; - if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { - clock->raw_time.tv_nsec -= NSEC_PER_SEC; - clock->raw_time.tv_sec++; + raw_time.tv_nsec += timekeeper.raw_interval; + if (raw_time.tv_nsec >= NSEC_PER_SEC) { + raw_time.tv_nsec -= NSEC_PER_SEC; + raw_time.tv_sec++; } /* accumulate error between NTP and clock interval */ - clock->error += tick_length; - clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); + timekeeper.ntp_error += tick_length; + timekeeper.ntp_error -= timekeeper.xtime_interval << + timekeeper.ntp_error_shift; } /* correct the clock when NTP error is too big */ - clocksource_adjust(offset); + timekeeping_adjust(offset); /* * Since in the loop above, we accumulate any amount of time * in xtime_nsec over a second into xtime.tv_sec, its possible for * xtime_nsec to be fairly small after the loop. Further, if we're - * slightly speeding the clocksource up in clocksource_adjust(), + * slightly speeding the clocksource up in timekeeping_adjust(), * its possible the required corrective factor to xtime_nsec could * cause it to underflow. * @@ -550,24 +792,25 @@ void update_wall_time(void) * We'll correct this error next time through this function, when * xtime_nsec is not as small. */ - if (unlikely((s64)clock->xtime_nsec < 0)) { - s64 neg = -(s64)clock->xtime_nsec; - clock->xtime_nsec = 0; - clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); + if (unlikely((s64)timekeeper.xtime_nsec < 0)) { + s64 neg = -(s64)timekeeper.xtime_nsec; + timekeeper.xtime_nsec = 0; + timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; } /* store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference. */ - xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; - clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; - clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); + xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; + timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; + timekeeper.ntp_error += timekeeper.xtime_nsec << + timekeeper.ntp_error_shift; - update_xtime_cache(cyc2ns(clock, offset)); + nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); + update_xtime_cache(nsecs); /* check to see if there is a new clocksource to use */ - change_clocksource(); - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); } /** @@ -583,9 +826,12 @@ void update_wall_time(void) */ void getboottime(struct timespec *ts) { - set_normalized_timespec(ts, - - (wall_to_monotonic.tv_sec + total_sleep_time), - - wall_to_monotonic.tv_nsec); + struct timespec boottime = { + .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, + .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec + }; + + set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } /** @@ -594,7 +840,7 @@ void getboottime(struct timespec *ts) */ void monotonic_to_bootbased(struct timespec *ts) { - ts->tv_sec += total_sleep_time; + *ts = timespec_add_safe(*ts, total_sleep_time); } unsigned long get_seconds(void) @@ -603,6 +849,10 @@ unsigned long get_seconds(void) } EXPORT_SYMBOL(get_seconds); +struct timespec __current_kernel_time(void) +{ + return xtime_cache; +} struct timespec current_kernel_time(void) { @@ -618,3 +868,20 @@ struct timespec current_kernel_time(void) return now; } EXPORT_SYMBOL(current_kernel_time); + +struct timespec get_monotonic_coarse(void) +{ + struct timespec now, mono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + + now = xtime_cache; + mono = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, + now.tv_nsec + mono.tv_nsec); + return now; +} diff --git a/kernel/timer.c b/kernel/timer.c index a7f07d5..5db5a8d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,7 +37,7 @@ #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/sched.h> #include <asm/uaccess.h> @@ -46,6 +46,9 @@ #include <asm/timex.h> #include <asm/io.h> +#define CREATE_TRACE_POINTS +#include <trace/events/timer.h> + u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); @@ -72,6 +75,7 @@ struct tvec_base { spinlock_t lock; struct timer_list *running_timer; unsigned long timer_jiffies; + unsigned long next_timer; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -520,6 +524,25 @@ static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } #endif +static inline void debug_init(struct timer_list *timer) +{ + debug_timer_init(timer); + trace_timer_init(timer); +} + +static inline void +debug_activate(struct timer_list *timer, unsigned long expires) +{ + debug_timer_activate(timer); + trace_timer_start(timer, expires); +} + +static inline void debug_deactivate(struct timer_list *timer) +{ + debug_timer_deactivate(timer); + trace_timer_cancel(timer); +} + static void __init_timer(struct timer_list *timer, const char *name, struct lock_class_key *key) @@ -548,7 +571,7 @@ void init_timer_key(struct timer_list *timer, const char *name, struct lock_class_key *key) { - debug_timer_init(timer); + debug_init(timer); __init_timer(timer, name, key); } EXPORT_SYMBOL(init_timer_key); @@ -567,7 +590,7 @@ static inline void detach_timer(struct timer_list *timer, { struct list_head *entry = &timer->entry; - debug_timer_deactivate(timer); + debug_deactivate(timer); __list_del(entry->prev, entry->next); if (clear_pending) @@ -622,13 +645,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, if (timer_pending(timer)) { detach_timer(timer, 0); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } else { if (pending_only) goto out_unlock; } - debug_timer_activate(timer); + debug_activate(timer, expires); new_base = __get_cpu_var(tvec_bases); @@ -663,6 +689,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } timer->expires = expires; + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; internal_add_timer(base, timer); out_unlock: @@ -780,7 +809,10 @@ void add_timer_on(struct timer_list *timer, int cpu) BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); - debug_timer_activate(timer); + debug_activate(timer, timer->expires); + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; internal_add_timer(base, timer); /* * Check whether the other CPU is idle and needs to be @@ -817,6 +849,9 @@ int del_timer(struct timer_list *timer) base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } spin_unlock_irqrestore(&base->lock, flags); @@ -850,6 +885,9 @@ int try_to_del_timer_sync(struct timer_list *timer) ret = 0; if (timer_pending(timer)) { detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } out: @@ -984,7 +1022,9 @@ static inline void __run_timers(struct tvec_base *base) */ lock_map_acquire(&lockdep_map); + trace_timer_expire_entry(timer); fn(data); + trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); @@ -1007,8 +1047,8 @@ static inline void __run_timers(struct tvec_base *base) #ifdef CONFIG_NO_HZ /* * Find out when the next timer event is due to happen. This - * is used on S/390 to stop all activity when a cpus is idle. - * This functions needs to be called disabled. + * is used on S/390 to stop all activity when a CPU is idle. + * This function needs to be called with interrupts disabled. */ static unsigned long __next_timer_interrupt(struct tvec_base *base) { @@ -1134,7 +1174,9 @@ unsigned long get_next_timer_interrupt(unsigned long now) unsigned long expires; spin_lock(&base->lock); - expires = __next_timer_interrupt(base); + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; spin_unlock(&base->lock); if (time_before_eq(expires, now)) @@ -1156,8 +1198,7 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); - if (rcu_pending(cpu)) - rcu_check_callbacks(cpu, user_tick); + rcu_check_callbacks(cpu, user_tick); printk_tick(); scheduler_tick(); run_posix_cpu_timers(p); @@ -1170,7 +1211,7 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); - perf_counter_do_pending(); + perf_event_do_pending(); hrtimer_run_pending(); @@ -1523,6 +1564,7 @@ static int __cpuinit init_timers_cpu(int cpu) INIT_LIST_HEAD(base->tv1.vec + j); base->timer_jiffies = jiffies; + base->next_timer = base->timer_jiffies; return 0; } @@ -1535,6 +1577,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, 0); timer_set_base(timer, new_base); + if (time_before(timer->expires, new_base->next_timer) && + !tbase_get_deferrable(timer->base)) + new_base->next_timer = timer->expires; internal_add_timer(new_base, timer); } } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e78dcbd..15372a9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -11,12 +11,18 @@ config NOP_TRACER config HAVE_FTRACE_NMI_ENTER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_TRACER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_GRAPH_TRACER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_GRAPH_FP_TEST bool @@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help - This gets selected when the arch tests the function_trace_stop - variable at the mcount call site. Otherwise, this variable - is tested by the called function. + See Documentation/trace/ftrace-implementation.txt config HAVE_DYNAMIC_FTRACE bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FTRACE_MCOUNT_RECORD bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_HW_BRANCH_TRACER bool config HAVE_SYSCALL_TRACEPOINTS bool + help + See Documentation/trace/ftrace-implementation.txt config TRACER_MAX_TRACE bool @@ -73,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP # This allows those options to appear when no other tracer is selected. But the # options do not appear when something else selects it. We need the two options # GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the -# hidding of the automatic options options. +# hidding of the automatic options. config TRACING bool @@ -481,6 +491,18 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config EVENT_TRACE_TEST_SYSCALLS + bool "Run selftest on syscall events" + depends on FTRACE_STARTUP_TEST + help + This option will also enable testing every syscall event. + It only enables the event and disables it and runs various loads + with the event enabled. This adds a bit more time for kernel boot + up since it runs this on every system call defined. + + TBD - enable a way to actually call the syscalls as we test their + events + config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 7c00a1e..c8cb75d 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o -obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o @@ -55,5 +54,6 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_TRACER) += trace_kprobe.o +obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8c804e2..23df7771 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1323,11 +1323,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) enum { FTRACE_ITER_FILTER = (1 << 0), - FTRACE_ITER_CONT = (1 << 1), - FTRACE_ITER_NOTRACE = (1 << 2), - FTRACE_ITER_FAILURES = (1 << 3), - FTRACE_ITER_PRINTALL = (1 << 4), - FTRACE_ITER_HASH = (1 << 5), + FTRACE_ITER_NOTRACE = (1 << 1), + FTRACE_ITER_FAILURES = (1 << 2), + FTRACE_ITER_PRINTALL = (1 << 3), + FTRACE_ITER_HASH = (1 << 4), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1337,8 +1336,7 @@ struct ftrace_iterator { int hidx; int idx; unsigned flags; - unsigned char buffer[FTRACE_BUFF_MAX+1]; - unsigned buffer_idx; + struct trace_parser parser; }; static void * @@ -1407,7 +1405,7 @@ static int t_hash_show(struct seq_file *m, void *v) if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); - seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); + seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); if (rec->data) seq_printf(m, ":%p", rec->data); @@ -1517,12 +1515,12 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - seq_printf(m, "%pf\n", (void *)rec->ip); + seq_printf(m, "%ps\n", (void *)rec->ip); return 0; } -static struct seq_operations show_ftrace_seq_ops = { +static const struct seq_operations show_ftrace_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -1604,6 +1602,11 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (!iter) return -ENOMEM; + if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { + kfree(iter); + return -ENOMEM; + } + mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) @@ -2059,9 +2062,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int i, len = 0; char *search; - if (glob && (strcmp(glob, "*") || !strlen(glob))) + if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) glob = NULL; - else { + else if (glob) { int not; type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); @@ -2196,9 +2199,8 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos, int enable) { struct ftrace_iterator *iter; - char ch; - size_t read = 0; - ssize_t ret; + struct trace_parser *parser; + ssize_t ret, read; if (!cnt || cnt < 0) return 0; @@ -2211,72 +2213,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, } else iter = file->private_data; - if (!*ppos) { - iter->flags &= ~FTRACE_ITER_CONT; - iter->buffer_idx = 0; - } + parser = &iter->parser; + read = trace_get_user(parser, ubuf, cnt, ppos); - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - - /* - * If the parser haven't finished with the last write, - * continue reading the user input without skipping spaces. - */ - if (!(iter->flags & FTRACE_ITER_CONT)) { - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - } - - /* only spaces were written */ - if (isspace(ch)) { - *ppos += read; - ret = read; - goto out; - } - - iter->buffer_idx = 0; - } - - while (cnt && !isspace(ch)) { - if (iter->buffer_idx < FTRACE_BUFF_MAX) - iter->buffer[iter->buffer_idx++] = ch; - else { - ret = -EINVAL; - goto out; - } - ret = get_user(ch, ubuf++); + if (trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { + ret = ftrace_process_regex(parser->buffer, + parser->idx, enable); if (ret) goto out; - read++; - cnt--; - } - if (isspace(ch)) { - iter->buffer[iter->buffer_idx] = 0; - ret = ftrace_process_regex(iter->buffer, - iter->buffer_idx, enable); - if (ret) - goto out; - iter->buffer_idx = 0; - } else { - iter->flags |= FTRACE_ITER_CONT; - iter->buffer[iter->buffer_idx++] = ch; + trace_parser_clear(parser); } - *ppos += read; ret = read; - out: - mutex_unlock(&ftrace_regex_lock); + mutex_unlock(&ftrace_regex_lock); +out: return ret; } @@ -2381,6 +2334,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; + struct trace_parser *parser; mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { @@ -2390,9 +2344,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) } else iter = file->private_data; - if (iter->buffer_idx) { - iter->buffer[iter->buffer_idx] = 0; - ftrace_match_records(iter->buffer, iter->buffer_idx, enable); + parser = &iter->parser; + if (trace_parser_loaded(parser)) { + parser->buffer[parser->idx] = 0; + ftrace_match_records(parser->buffer, parser->idx, enable); } mutex_lock(&ftrace_lock); @@ -2400,7 +2355,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_lock); + trace_parser_put(parser); kfree(iter); + mutex_unlock(&ftrace_regex_lock); return 0; } @@ -2457,11 +2414,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; static void * __g_next(struct seq_file *m, loff_t *pos) { - unsigned long *array = m->private; - if (*pos >= ftrace_graph_count) return NULL; - return &array[*pos]; + return &ftrace_graph_funcs[*pos]; } static void * @@ -2499,12 +2454,12 @@ static int g_show(struct seq_file *m, void *v) return 0; } - seq_printf(m, "%pf\n", v); + seq_printf(m, "%ps\n", (void *)*ptr); return 0; } -static struct seq_operations ftrace_graph_seq_ops = { +static const struct seq_operations ftrace_graph_seq_ops = { .start = g_start, .next = g_next, .stop = g_stop, @@ -2525,16 +2480,10 @@ ftrace_graph_open(struct inode *inode, struct file *file) ftrace_graph_count = 0; memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); } + mutex_unlock(&graph_lock); - if (file->f_mode & FMODE_READ) { + if (file->f_mode & FMODE_READ) ret = seq_open(file, &ftrace_graph_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = ftrace_graph_funcs; - } - } else - file->private_data = ftrace_graph_funcs; - mutex_unlock(&graph_lock); return ret; } @@ -2602,12 +2551,9 @@ static ssize_t ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - unsigned char buffer[FTRACE_BUFF_MAX+1]; - unsigned long *array; + struct trace_parser parser; size_t read = 0; ssize_t ret; - int index = 0; - char ch; if (!cnt || cnt < 0) return 0; @@ -2619,57 +2565,26 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, goto out; } - if (file->f_mode & FMODE_READ) { - struct seq_file *m = file->private_data; - array = m->private; - } else - array = file->private_data; - - ret = get_user(ch, ubuf++); - if (ret) + if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { + ret = -ENOMEM; goto out; - read++; - cnt--; - - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; } - if (isspace(ch)) { - *ppos += read; - ret = read; - goto out; - } + read = trace_get_user(&parser, ubuf, cnt, ppos); - while (cnt && !isspace(ch)) { - if (index < FTRACE_BUFF_MAX) - buffer[index++] = ch; - else { - ret = -EINVAL; - goto out; - } - ret = get_user(ch, ubuf++); + if (trace_parser_loaded((&parser))) { + parser.buffer[parser.idx] = 0; + + /* we allow only one expression at a time */ + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + parser.buffer); if (ret) goto out; - read++; - cnt--; } - buffer[index] = 0; - - /* we allow only one expression at a time */ - ret = ftrace_set_func(array, &ftrace_graph_count, buffer); - if (ret) - goto out; - - file->f_pos += read; ret = read; out: + trace_parser_put(&parser); mutex_unlock(&graph_lock); return ret; diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c new file mode 100644 index 0000000..e06c6e3 --- /dev/null +++ b/kernel/trace/power-traces.c @@ -0,0 +1,20 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com> + */ + +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/slab.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/power.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(power_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); + diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 454e74e..d4ff019 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -201,8 +201,6 @@ int tracing_is_on(void) } EXPORT_SYMBOL_GPL(tracing_is_on); -#include "trace.h" - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) @@ -701,8 +699,8 @@ static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, val &= ~RB_FLAG_MASK; - ret = (unsigned long)cmpxchg(&list->next, - val | old_flag, val | new_flag); + ret = cmpxchg((unsigned long *)&list->next, + val | old_flag, val | new_flag); /* check if the reader took the page */ if ((ret & ~RB_FLAG_MASK) != val) @@ -794,7 +792,7 @@ static int rb_head_page_replace(struct buffer_page *old, val = *ptr & ~RB_FLAG_MASK; val |= RB_PAGE_HEAD; - ret = cmpxchg(ptr, val, &new->list); + ret = cmpxchg(ptr, val, (unsigned long)&new->list); return ret == val; } @@ -2997,15 +2995,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) } static struct ring_buffer_event * -rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) { - struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; struct buffer_page *reader; int nr_loops = 0; - cpu_buffer = buffer->buffers[cpu]; - again: /* * We repeat when a timestamp is encountered. It is possible @@ -3049,7 +3044,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = cpu_buffer->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(buffer, + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } return event; @@ -3168,7 +3163,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) local_irq_save(flags); if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(buffer, cpu, ts); + event = rb_buffer_peek(cpu_buffer, ts); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) @@ -3237,7 +3232,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(buffer, cpu, ts); + event = rb_buffer_peek(cpu_buffer, ts); if (event) rb_advance_reader(cpu_buffer); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c75dee..6c0f6a8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -125,13 +125,13 @@ int ftrace_dump_on_oops; static int tracing_set_tracer(const char *buf); -#define BOOTUP_TRACER_SIZE 100 -static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; +#define MAX_TRACER_SIZE 100 +static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; static int __init set_ftrace(char *str) { - strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); + strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = 1; @@ -242,13 +242,6 @@ static struct tracer *trace_types __read_mostly; static struct tracer *current_trace __read_mostly; /* - * max_tracer_type_len is used to simplify the allocating of - * buffers to read userspace tracer names. We keep track of - * the longest tracer name registered. - */ -static int max_tracer_type_len; - -/* * trace_types_lock is used to protect the trace_types list. * This lock is also used to keep user access serialized. * Accesses from userspace will grab this lock while userspace @@ -275,12 +268,18 @@ static DEFINE_SPINLOCK(tracing_start_lock); */ void trace_wake_up(void) { + int cpu; + + if (trace_flags & TRACE_ITER_BLOCK) + return; /* * The runqueue_is_locked() can fail, but this is the best we * have for now: */ - if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) + cpu = get_cpu(); + if (!runqueue_is_locked(cpu)) wake_up(&trace_wait); + put_cpu(); } static int __init set_buf_size(char *str) @@ -339,6 +338,112 @@ static struct { int trace_clock_id; +/* + * trace_parser_get_init - gets the buffer for trace parser + */ +int trace_parser_get_init(struct trace_parser *parser, int size) +{ + memset(parser, 0, sizeof(*parser)); + + parser->buffer = kmalloc(size, GFP_KERNEL); + if (!parser->buffer) + return 1; + + parser->size = size; + return 0; +} + +/* + * trace_parser_put - frees the buffer for trace parser + */ +void trace_parser_put(struct trace_parser *parser) +{ + kfree(parser->buffer); +} + +/* + * trace_get_user - reads the user input string separated by space + * (matched by isspace(ch)) + * + * For each string found the 'struct trace_parser' is updated, + * and the function returns. + * + * Returns number of bytes read. + * + * See kernel/trace/trace.h for 'struct trace_parser' details. + */ +int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char ch; + size_t read = 0; + ssize_t ret; + + if (!*ppos) + trace_parser_clear(parser); + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + + read++; + cnt--; + + /* + * The parser is not finished with the last write, + * continue reading the user input without skipping spaces. + */ + if (!parser->cont) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* only spaces were written */ + if (isspace(ch)) { + *ppos += read; + ret = read; + goto out; + } + + parser->idx = 0; + } + + /* read the non-space input */ + while (cnt && !isspace(ch)) { + if (parser->idx < parser->size) + parser->buffer[parser->idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* We either got finished input or we have to wait for another call. */ + if (isspace(ch)) { + parser->buffer[parser->idx] = 0; + parser->cont = false; + } else { + parser->cont = true; + parser->buffer[parser->idx++] = ch; + } + + *ppos += read; + ret = read; + +out: + return ret; +} + ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) { int len; @@ -513,7 +618,6 @@ __releases(kernel_lock) __acquires(kernel_lock) { struct tracer *t; - int len; int ret = 0; if (!type->name) { @@ -521,6 +625,11 @@ __acquires(kernel_lock) return -1; } + if (strlen(type->name) > MAX_TRACER_SIZE) { + pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); + return -1; + } + /* * When this gets called we hold the BKL which means that * preemption is disabled. Various trace selftests however @@ -535,7 +644,7 @@ __acquires(kernel_lock) for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ - pr_info("Trace %s already registered\n", + pr_info("Tracer %s already registered\n", type->name); ret = -1; goto out; @@ -586,9 +695,6 @@ __acquires(kernel_lock) type->next = trace_types; trace_types = type; - len = strlen(type->name); - if (len > max_tracer_type_len) - max_tracer_type_len = len; out: tracing_selftest_running = false; @@ -597,7 +703,7 @@ __acquires(kernel_lock) if (ret || !default_bootup_tracer) goto out_unlock; - if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) + if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) goto out_unlock; printk(KERN_INFO "Starting tracer '%s'\n", type->name); @@ -619,14 +725,13 @@ __acquires(kernel_lock) void unregister_tracer(struct tracer *type) { struct tracer **t; - int len; mutex_lock(&trace_types_lock); for (t = &trace_types; *t; t = &(*t)->next) { if (*t == type) goto found; } - pr_info("Trace %s not registered\n", type->name); + pr_info("Tracer %s not registered\n", type->name); goto out; found: @@ -639,17 +744,7 @@ void unregister_tracer(struct tracer *type) current_trace->stop(&global_trace); current_trace = &nop_trace; } - - if (strlen(type->name) != max_tracer_type_len) - goto out; - - max_tracer_type_len = 0; - for (t = &trace_types; *t; t = &(*t)->next) { - len = strlen((*t)->name); - if (len > max_tracer_type_len) - max_tracer_type_len = len; - } - out: +out: mutex_unlock(&trace_types_lock); } @@ -719,6 +814,11 @@ static void trace_init_cmdlines(void) cmdline_idx = 0; } +int is_tracing_stopped(void) +{ + return trace_stop_count; +} + /** * ftrace_off_permanent - disable all ftrace code permanently * @@ -886,7 +986,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->tgid = (tsk) ? tsk->tgid : 0; + entry->lock_depth = (tsk) ? tsk->lock_depth : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1068,6 +1168,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) return; entry = ring_buffer_event_data(event); + entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); trace.nr_entries = 0; @@ -1094,6 +1195,7 @@ ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { + struct ftrace_event_call *call = &event_special; struct ring_buffer_event *event; struct trace_array *tr = __tr; struct ring_buffer *buffer = tr->buffer; @@ -1107,7 +1209,9 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - trace_buffer_unlock_commit(buffer, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void @@ -1530,10 +1634,10 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / \n"); - seq_puts(m, "# ||||| delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# |||| /_--=> lock-depth \n"); + seq_puts(m, "# |||||/ delay \n"); + seq_puts(m, "# cmd pid |||||| time | caller \n"); + seq_puts(m, "# \\ / |||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) @@ -1845,7 +1949,7 @@ static int s_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations tracer_seq_ops = { +static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, .stop = s_stop, @@ -2059,7 +2163,7 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations show_traces_seq_ops = { +static const struct seq_operations show_traces_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -2489,7 +2593,7 @@ static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[max_tracer_type_len+2]; + char buf[MAX_TRACER_SIZE+2]; int r; mutex_lock(&trace_types_lock); @@ -2639,15 +2743,15 @@ static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[max_tracer_type_len+1]; + char buf[MAX_TRACER_SIZE+1]; int i; size_t ret; int err; ret = cnt; - if (cnt > max_tracer_type_len) - cnt = max_tracer_type_len; + if (cnt > MAX_TRACER_SIZE) + cnt = MAX_TRACER_SIZE; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8210649..104c1a7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -7,10 +7,10 @@ #include <linux/clocksource.h> #include <linux/ring_buffer.h> #include <linux/mmiotrace.h> +#include <linux/tracepoint.h> #include <linux/ftrace.h> #include <trace/boot.h> #include <linux/kmemtrace.h> -#include <trace/power.h> #include <linux/trace_seq.h> #include <linux/ftrace_event.h> @@ -36,163 +36,59 @@ enum trace_type { TRACE_HW_BRANCHES, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, - TRACE_POWER, TRACE_BLK, __TRACE_LAST_TYPE, }; -/* - * Function trace entry - function address and parent function addres: - */ -struct ftrace_entry { - struct trace_entry ent; - unsigned long ip; - unsigned long parent_ip; -}; - -/* Function call entry */ -struct ftrace_graph_ent_entry { - struct trace_entry ent; - struct ftrace_graph_ent graph_ent; +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ }; -/* Function return entry */ -struct ftrace_graph_ret_entry { - struct trace_entry ent; - struct ftrace_graph_ret ret; -}; extern struct tracer boot_tracer; -/* - * Context switch trace entry - which task (and prio) we switched from/to: - */ -struct ctx_switch_entry { - struct trace_entry ent; - unsigned int prev_pid; - unsigned char prev_prio; - unsigned char prev_state; - unsigned int next_pid; - unsigned char next_prio; - unsigned char next_state; - unsigned int next_cpu; -}; - -/* - * Special (free-form) trace entry: - */ -struct special_entry { - struct trace_entry ent; - unsigned long arg1; - unsigned long arg2; - unsigned long arg3; -}; - -/* - * Stack-trace entry: - */ - -#define FTRACE_STACK_ENTRIES 8 +#undef __field +#define __field(type, item) type item; -struct stack_entry { - struct trace_entry ent; - unsigned long caller[FTRACE_STACK_ENTRIES]; -}; +#undef __field_struct +#define __field_struct(type, item) __field(type, item) -struct userstack_entry { - struct trace_entry ent; - unsigned long caller[FTRACE_STACK_ENTRIES]; -}; +#undef __field_desc +#define __field_desc(type, container, item) -/* - * trace_printk entry: - */ -struct bprint_entry { - struct trace_entry ent; - unsigned long ip; - const char *fmt; - u32 buf[]; -}; +#undef __array +#define __array(type, item, size) type item[size]; -struct print_entry { - struct trace_entry ent; - unsigned long ip; - char buf[]; -}; - -#define TRACE_OLD_SIZE 88 - -struct trace_field_cont { - unsigned char type; - /* Temporary till we get rid of this completely */ - char buf[TRACE_OLD_SIZE - 1]; -}; +#undef __array_desc +#define __array_desc(type, container, item, size) -struct trace_mmiotrace_rw { - struct trace_entry ent; - struct mmiotrace_rw rw; -}; +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; -struct trace_mmiotrace_map { - struct trace_entry ent; - struct mmiotrace_map map; -}; - -struct trace_boot_call { - struct trace_entry ent; - struct boot_trace_call boot_call; -}; +#undef F_STRUCT +#define F_STRUCT(args...) args -struct trace_boot_ret { - struct trace_entry ent; - struct boot_trace_ret boot_ret; -}; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ + } -#define TRACE_FUNC_SIZE 30 -#define TRACE_FILE_SIZE 20 -struct trace_branch { - struct trace_entry ent; - unsigned line; - char func[TRACE_FUNC_SIZE+1]; - char file[TRACE_FILE_SIZE+1]; - char correct; -}; +#undef TP_ARGS +#define TP_ARGS(args...) args -struct hw_branch_entry { - struct trace_entry ent; - u64 from; - u64 to; -}; +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) -struct trace_power { - struct trace_entry ent; - struct power_trace state_data; -}; - -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; - -struct kmemtrace_alloc_entry { - struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; - size_t bytes_req; - size_t bytes_alloc; - gfp_t gfp_flags; - int node; -}; - -struct kmemtrace_free_entry { - struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; -}; +#include "trace_entries.h" +/* + * syscalls are special, and need special handling, this is why + * they are not included in trace_entries.h + */ struct syscall_trace_enter { struct trace_entry ent; int nr; @@ -228,14 +124,12 @@ struct kretprobe_trace_entry { (offsetof(struct kretprobe_trace_entry, args) + \ (sizeof(unsigned long) * (n))) - - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: * IRQS_OFF - interrupts were disabled * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags - * NEED_RESCED - reschedule is requested + * NEED_RESCHED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler */ @@ -334,7 +228,6 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ - IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ @@ -414,7 +307,6 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; - struct tracer_stat *stats; }; @@ -493,6 +385,7 @@ void tracing_stop_sched_switch_record(void); void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); +int is_tracing_stopped(void); extern unsigned long nsecs_to_usecs(unsigned long nsecs); @@ -533,20 +426,6 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -typedef void -(*tracer_switch_func_t)(void *private, - void *__rq, - struct task_struct *prev, - struct task_struct *next); - -struct tracer_switch_ops { - tracer_switch_func_t func; - void *private; - struct tracer_switch_ops *next; -}; -#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ - extern void trace_find_cmdline(int pid, char comm[]); #ifdef CONFIG_DYNAMIC_FTRACE @@ -662,6 +541,41 @@ static inline int ftrace_trace_task(struct task_struct *task) #endif /* + * struct trace_parser - servers for reading the user input separated by spaces + * @cont: set if the input is not complete - no final space char was found + * @buffer: holds the parsed user input + * @idx: user input lenght + * @size: buffer size + */ +struct trace_parser { + bool cont; + char *buffer; + unsigned idx; + unsigned size; +}; + +static inline bool trace_parser_loaded(struct trace_parser *parser) +{ + return (parser->idx != 0); +} + +static inline bool trace_parser_cont(struct trace_parser *parser) +{ + return parser->cont; +} + +static inline void trace_parser_clear(struct trace_parser *parser) +{ + parser->cont = false; + parser->idx = 0; +} + +extern int trace_parser_get_init(struct trace_parser *parser, int size); +extern void trace_parser_put(struct trace_parser *parser); +extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos); + +/* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. * @@ -847,58 +761,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -#define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - int match = 0; \ - \ - switch (pred->op) { \ - case OP_LT: \ - match = (*addr < val); \ - break; \ - case OP_LE: \ - match = (*addr <= val); \ - break; \ - case OP_GT: \ - match = (*addr > val); \ - break; \ - case OP_GE: \ - match = (*addr >= val); \ - break; \ - default: \ - break; \ - } \ - \ - return match; \ -} - -#define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ -{ \ - u##size *addr = (u##size *)(event + pred->offset); \ - u##size val = (u##size)pred->val; \ - int match; \ - \ - match = (val == *addr) ^ pred->not; \ - \ - return match; \ -} - extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ extern struct ftrace_event_call event_##call; -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) -#include "trace_event_types.h" +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#include "trace_entries.h" #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 19bfc75..c21d5f3 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -129,6 +129,7 @@ struct tracer boot_tracer __read_mostly = void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { + struct ftrace_event_call *call = &event_boot_call; struct ring_buffer_event *event; struct ring_buffer *buffer; struct trace_boot_call *entry; @@ -150,13 +151,15 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) goto out; entry = ring_buffer_event_data(event); entry->boot_call = *bt; - trace_buffer_unlock_commit(buffer, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { + struct ftrace_event_call *call = &event_boot_ret; struct ring_buffer_event *event; struct ring_buffer *buffer; struct trace_boot_ret *entry; @@ -175,7 +178,8 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) goto out; entry = ring_buffer_event_data(event); entry->boot_ret = *bt; - trace_buffer_unlock_commit(buffer, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index b588fd8..20c5f92 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -66,10 +66,14 @@ u64 notrace trace_clock(void) * Used by plugins that need globally coherent timestamps. */ -static u64 prev_trace_clock_time; - -static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +/* keep prev_time and lock in the same cacheline. */ +static struct { + u64 prev_time; + raw_spinlock_t lock; +} trace_clock_struct ____cacheline_aligned_in_smp = + { + .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, + }; u64 notrace trace_clock_global(void) { @@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void) if (unlikely(in_nmi())) goto out; - __raw_spin_lock(&trace_clock_lock); + __raw_spin_lock(&trace_clock_struct.lock); /* * TODO: if this happens often then maybe we should reset - * my_scd->clock to prev_trace_clock_time+1, to make sure + * my_scd->clock to prev_time+1, to make sure * we start ticking with the local clock from now on? */ - if ((s64)(now - prev_trace_clock_time) < 0) - now = prev_trace_clock_time + 1; + if ((s64)(now - trace_clock_struct.prev_time) < 0) + now = trace_clock_struct.prev_time + 1; - prev_trace_clock_time = now; + trace_clock_struct.prev_time = now; - __raw_spin_unlock(&trace_clock_lock); + __raw_spin_unlock(&trace_clock_struct.lock); out: raw_local_irq_restore(flags); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h new file mode 100644 index 0000000..ead3d72 --- /dev/null +++ b/kernel/trace/trace_entries.h @@ -0,0 +1,366 @@ +/* + * This file defines the trace event structures that go into the ring + * buffer directly. They are created via macros so that changes for them + * appear in the format file. Using macros will automate this process. + * + * The macro used to create a ftrace data structure is: + * + * FTRACE_ENTRY( name, struct_name, id, structure, print ) + * + * @name: the name used the event name, as well as the name of + * the directory that holds the format file. + * + * @struct_name: the name of the structure that is created. + * + * @id: The event identifier that is used to detect what event + * this is from the ring buffer. + * + * @structure: the structure layout + * + * - __field( type, item ) + * This is equivalent to declaring + * type item; + * in the structure. + * - __array( type, item, size ) + * This is equivalent to declaring + * type item[size]; + * in the structure. + * + * * for structures within structures, the format of the internal + * structure is layed out. This allows the internal structure + * to be deciphered for the format file. Although these macros + * may become out of sync with the internal structure, they + * will create a compile error if it happens. Since the + * internel structures are just tracing helpers, this is not + * an issue. + * + * When an internal structure is used, it should use: + * + * __field_struct( type, item ) + * + * instead of __field. This will prevent it from being shown in + * the output file. The fields in the structure should use. + * + * __field_desc( type, container, item ) + * __array_desc( type, container, item, len ) + * + * type, item and len are the same as __field and __array, but + * container is added. This is the name of the item in + * __field_struct that this is describing. + * + * + * @print: the print format shown to users in the format file. + */ + +/* + * Function trace entry - function address and parent function addres: + */ +FTRACE_ENTRY(function, ftrace_entry, + + TRACE_FN, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + ), + + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) +); + +/* Function call entry */ +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, + + TRACE_GRAPH_ENT, + + F_STRUCT( + __field_struct( struct ftrace_graph_ent, graph_ent ) + __field_desc( unsigned long, graph_ent, func ) + __field_desc( int, graph_ent, depth ) + ), + + F_printk("--> %lx (%d)", __entry->func, __entry->depth) +); + +/* Function return entry */ +FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_desc( unsigned long, ret, func ) + __field_desc( unsigned long long, ret, calltime) + __field_desc( unsigned long long, ret, rettime ) + __field_desc( unsigned long, ret, overrun ) + __field_desc( int, ret, depth ) + ), + + F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", + __entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->depth) +); + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + * + * This is used for both wakeup and context switches. We only want + * to create one structure, but we need two outputs for it. + */ +#define FTRACE_CTX_FIELDS \ + __field( unsigned int, prev_pid ) \ + __field( unsigned char, prev_prio ) \ + __field( unsigned char, prev_state ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned char, next_prio ) \ + __field( unsigned char, next_state ) \ + __field( unsigned int, next_cpu ) + +FTRACE_ENTRY(context_switch, ctx_switch_entry, + + TRACE_CTX, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * FTRACE_ENTRY_DUP only creates the format file, it will not + * create another structure. + */ +FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, + + TRACE_WAKE, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * Special (free-form) trace entry: + */ +FTRACE_ENTRY(special, special_entry, + + TRACE_SPECIAL, + + F_STRUCT( + __field( unsigned long, arg1 ) + __field( unsigned long, arg2 ) + __field( unsigned long, arg3 ) + ), + + F_printk("(%08lx) (%08lx) (%08lx)", + __entry->arg1, __entry->arg2, __entry->arg3) +); + +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 8 + +FTRACE_ENTRY(kernel_stack, stack_entry, + + TRACE_STACK, + + F_STRUCT( + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +FTRACE_ENTRY(user_stack, userstack_entry, + + TRACE_USER_STACK, + + F_STRUCT( + __field( unsigned int, tgid ) + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +/* + * trace_printk entry: + */ +FTRACE_ENTRY(bprint, bprint_entry, + + TRACE_BPRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, fmt ) + __dynamic_array( u32, buf ) + ), + + F_printk("%08lx fmt:%p", + __entry->ip, __entry->fmt) +); + +FTRACE_ENTRY(print, print_entry, + + TRACE_PRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __dynamic_array( char, buf ) + ), + + F_printk("%08lx %s", + __entry->ip, __entry->buf) +); + +FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, + + TRACE_MMIO_RW, + + F_STRUCT( + __field_struct( struct mmiotrace_rw, rw ) + __field_desc( resource_size_t, rw, phys ) + __field_desc( unsigned long, rw, value ) + __field_desc( unsigned long, rw, pc ) + __field_desc( int, rw, map_id ) + __field_desc( unsigned char, rw, opcode ) + __field_desc( unsigned char, rw, width ) + ), + + F_printk("%lx %lx %lx %d %x %x", + (unsigned long)__entry->phys, __entry->value, __entry->pc, + __entry->map_id, __entry->opcode, __entry->width) +); + +FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, + + TRACE_MMIO_MAP, + + F_STRUCT( + __field_struct( struct mmiotrace_map, map ) + __field_desc( resource_size_t, map, phys ) + __field_desc( unsigned long, map, virt ) + __field_desc( unsigned long, map, len ) + __field_desc( int, map, map_id ) + __field_desc( unsigned char, map, opcode ) + ), + + F_printk("%lx %lx %lx %d %x", + (unsigned long)__entry->phys, __entry->virt, __entry->len, + __entry->map_id, __entry->opcode) +); + +FTRACE_ENTRY(boot_call, trace_boot_call, + + TRACE_BOOT_CALL, + + F_STRUCT( + __field_struct( struct boot_trace_call, boot_call ) + __field_desc( pid_t, boot_call, caller ) + __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN) + ), + + F_printk("%d %s", __entry->caller, __entry->func) +); + +FTRACE_ENTRY(boot_ret, trace_boot_ret, + + TRACE_BOOT_RET, + + F_STRUCT( + __field_struct( struct boot_trace_ret, boot_ret ) + __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) + __field_desc( int, boot_ret, result ) + __field_desc( unsigned long, boot_ret, duration ) + ), + + F_printk("%s %d %lx", + __entry->func, __entry->result, __entry->duration) +); + +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 + +FTRACE_ENTRY(branch, trace_branch, + + TRACE_BRANCH, + + F_STRUCT( + __field( unsigned int, line ) + __array( char, func, TRACE_FUNC_SIZE+1 ) + __array( char, file, TRACE_FILE_SIZE+1 ) + __field( char, correct ) + ), + + F_printk("%u:%s:%s (%u)", + __entry->line, + __entry->func, __entry->file, __entry->correct) +); + +FTRACE_ENTRY(hw_branch, hw_branch_entry, + + TRACE_HW_BRANCHES, + + F_STRUCT( + __field( u64, from ) + __field( u64, to ) + ), + + F_printk("from: %llx to: %llx", __entry->from, __entry->to) +); + +FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, + + TRACE_KMEM_ALLOC, + + F_STRUCT( + __field( enum kmemtrace_type_id, type_id ) + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" + " flags:%x node:%d", + __entry->type_id, __entry->call_site, __entry->ptr, + __entry->bytes_req, __entry->bytes_alloc, + __entry->gfp_flags, __entry->node) +); + +FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, + + TRACE_KMEM_FREE, + + F_STRUCT( + __field( enum kmemtrace_type_id, type_id ) + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + F_printk("type:%u call_site:%lx ptr:%p", + __entry->type_id, __entry->call_site, __entry->ptr) +); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 11ba5bb..e812f1c 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -5,8 +5,60 @@ * */ +#include <linux/module.h> #include "trace.h" +/* + * We can't use a size but a type in alloc_percpu() + * So let's create a dummy type that matches the desired size + */ +typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t; + +char *trace_profile_buf; +EXPORT_SYMBOL_GPL(trace_profile_buf); + +char *trace_profile_buf_nmi; +EXPORT_SYMBOL_GPL(trace_profile_buf_nmi); + +/* Count the events in use (per event id, not per instance) */ +static int total_profile_count; + +static int ftrace_profile_enable_event(struct ftrace_event_call *event) +{ + char *buf; + int ret = -ENOMEM; + + if (atomic_inc_return(&event->profile_count)) + return 0; + + if (!total_profile_count++) { + buf = (char *)alloc_percpu(profile_buf_t); + if (!buf) + goto fail_buf; + + rcu_assign_pointer(trace_profile_buf, buf); + + buf = (char *)alloc_percpu(profile_buf_t); + if (!buf) + goto fail_buf_nmi; + + rcu_assign_pointer(trace_profile_buf_nmi, buf); + } + + ret = event->profile_enable(event); + if (!ret) + return 0; + + kfree(trace_profile_buf_nmi); +fail_buf_nmi: + kfree(trace_profile_buf); +fail_buf: + total_profile_count--; + atomic_dec(&event->profile_count); + + return ret; +} + int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; @@ -14,8 +66,9 @@ int ftrace_profile_enable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && event->profile_enable) { - ret = event->profile_enable(event); + if (event->id == event_id && event->profile_enable && + try_module_get(event->mod)) { + ret = ftrace_profile_enable_event(event); break; } } @@ -24,6 +77,33 @@ int ftrace_profile_enable(int event_id) return ret; } +static void ftrace_profile_disable_event(struct ftrace_event_call *event) +{ + char *buf, *nmi_buf; + + if (!atomic_add_negative(-1, &event->profile_count)) + return; + + event->profile_disable(event); + + if (!--total_profile_count) { + buf = trace_profile_buf; + rcu_assign_pointer(trace_profile_buf, NULL); + + nmi_buf = trace_profile_buf_nmi; + rcu_assign_pointer(trace_profile_buf_nmi, NULL); + + /* + * Ensure every events in profiling have finished before + * releasing the buffers + */ + synchronize_sched(); + + free_percpu(buf); + free_percpu(nmi_buf); + } +} + void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; @@ -31,7 +111,8 @@ void ftrace_profile_disable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) { - event->profile_disable(event); + ftrace_profile_disable_event(event); + module_put(event->mod); break; } } diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h deleted file mode 100644 index e74f090..0000000 --- a/kernel/trace/trace_event_types.h +++ /dev/null @@ -1,178 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM ftrace - -/* - * We cheat and use the proto type field as the ID - * and args as the entry type (minus 'struct') - */ -TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(unsigned long, parent_ip, parent_ip) - ), - TP_RAW_FMT(" %lx <-- %lx") -); - -TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, - ftrace_graph_ent_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, graph_ent.func, func) - TRACE_FIELD(int, graph_ent.depth, depth) - ), - TP_RAW_FMT("--> %lx (%d)") -); - -TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, - ftrace_graph_ret_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ret.func, func) - TRACE_FIELD(unsigned long long, ret.calltime, calltime) - TRACE_FIELD(unsigned long long, ret.rettime, rettime) - TRACE_FIELD(unsigned long, ret.overrun, overrun) - TRACE_FIELD(int, ret.depth, depth) - ), - TP_RAW_FMT("<-- %lx (%d)") -); - -TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, prev_pid, prev_pid) - TRACE_FIELD(unsigned char, prev_prio, prev_prio) - TRACE_FIELD(unsigned char, prev_state, prev_state) - TRACE_FIELD(unsigned int, next_pid, next_pid) - TRACE_FIELD(unsigned char, next_prio, next_prio) - TRACE_FIELD(unsigned char, next_state, next_state) - TRACE_FIELD(unsigned int, next_cpu, next_cpu) - ), - TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, prev_pid, prev_pid) - TRACE_FIELD(unsigned char, prev_prio, prev_prio) - TRACE_FIELD(unsigned char, prev_state, prev_state) - TRACE_FIELD(unsigned int, next_pid, next_pid) - TRACE_FIELD(unsigned char, next_prio, next_prio) - TRACE_FIELD(unsigned char, next_state, next_state) - TRACE_FIELD(unsigned int, next_cpu, next_cpu) - ), - TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, arg1, arg1) - TRACE_FIELD(unsigned long, arg2, arg2) - TRACE_FIELD(unsigned long, arg3, arg3) - ), - TP_RAW_FMT("(%08lx) (%08lx) (%08lx)") -); - -/* - * Stack-trace entry: - */ - -/* #define FTRACE_STACK_ENTRIES 8 */ - -TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, caller[0], stack0) - TRACE_FIELD(unsigned long, caller[1], stack1) - TRACE_FIELD(unsigned long, caller[2], stack2) - TRACE_FIELD(unsigned long, caller[3], stack3) - TRACE_FIELD(unsigned long, caller[4], stack4) - TRACE_FIELD(unsigned long, caller[5], stack5) - TRACE_FIELD(unsigned long, caller[6], stack6) - TRACE_FIELD(unsigned long, caller[7], stack7) - ), - TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, caller[0], stack0) - TRACE_FIELD(unsigned long, caller[1], stack1) - TRACE_FIELD(unsigned long, caller[2], stack2) - TRACE_FIELD(unsigned long, caller[3], stack3) - TRACE_FIELD(unsigned long, caller[4], stack4) - TRACE_FIELD(unsigned long, caller[5], stack5) - TRACE_FIELD(unsigned long, caller[6], stack6) - TRACE_FIELD(unsigned long, caller[7], stack7) - ), - TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(char *, fmt, fmt) - TRACE_FIELD_ZERO(char, buf) - ), - TP_RAW_FMT("%08lx (%d) fmt:%p %s") -); - -TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD_ZERO(char, buf) - ), - TP_RAW_FMT("%08lx (%d) fmt:%p %s") -); - -TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, line, line) - TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, - TRACE_FUNC_SIZE+1, func) - TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, - TRACE_FUNC_SIZE+1, file) - TRACE_FIELD(char, correct, correct) - ), - TP_RAW_FMT("%u:%s:%s (%u)") -); - -TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(u64, from, from) - TRACE_FIELD(u64, to, to) - ), - TP_RAW_FMT("from: %llx to: %llx") -); - -TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, - TRACE_STRUCT( - TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) - TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) - TRACE_FIELD(int, state_data.type, type) - TRACE_FIELD(int, state_data.state, state) - ), - TP_RAW_FMT("%llx->%llx type:%u state:%u") -); - -TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) - TRACE_FIELD(unsigned long, call_site, call_site) - TRACE_FIELD(const void *, ptr, ptr) - TRACE_FIELD(size_t, bytes_req, bytes_req) - TRACE_FIELD(size_t, bytes_alloc, bytes_alloc) - TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) - TRACE_FIELD(int, node, node) - ), - TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" - " flags:%x node:%d") -); - -TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) - TRACE_FIELD(unsigned long, call_site, call_site) - TRACE_FIELD(const void *, ptr, ptr) - ), - TP_RAW_FMT("type:%u call_site:%lx ptr:%p") -); - -#undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f85b0f1..a4b7c9a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -21,6 +21,7 @@ #include "trace_output.h" +#undef TRACE_SYSTEM #define TRACE_SYSTEM "TRACE_SYSTEM" DEFINE_MUTEX(event_mutex); @@ -86,7 +87,7 @@ int trace_define_common_fields(struct ftrace_event_call *call) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, tgid); + __common_field(int, lock_depth); return ret; } @@ -226,11 +227,9 @@ static ssize_t ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_parser parser; size_t read = 0; - int i, set = 1; ssize_t ret; - char *buf; - char ch; if (!cnt || cnt < 0) return 0; @@ -239,60 +238,28 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (ret < 0) return ret; - ret = get_user(ch, ubuf++); - if (ret) - return ret; - read++; - cnt--; - - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - return ret; - read++; - cnt--; - } - - /* Only white space found? */ - if (isspace(ch)) { - file->f_pos += read; - ret = read; - return ret; - } - - buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL); - if (!buf) + if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) return -ENOMEM; - if (cnt > EVENT_BUF_SIZE) - cnt = EVENT_BUF_SIZE; + read = trace_get_user(&parser, ubuf, cnt, ppos); + + if (trace_parser_loaded((&parser))) { + int set = 1; - i = 0; - while (cnt && !isspace(ch)) { - if (!i && ch == '!') + if (*parser.buffer == '!') set = 0; - else - buf[i++] = ch; - ret = get_user(ch, ubuf++); + parser.buffer[parser.idx] = 0; + + ret = ftrace_set_clr_event(parser.buffer + !set, set); if (ret) - goto out_free; - read++; - cnt--; + goto out_put; } - buf[i] = 0; - - file->f_pos += read; - - ret = ftrace_set_clr_event(buf, set); - if (ret) - goto out_free; ret = read; - out_free: - kfree(buf); + out_put: + trace_parser_put(&parser); return ret; } @@ -300,42 +267,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct list_head *list = m->private; - struct ftrace_event_call *call; + struct ftrace_event_call *call = v; (*pos)++; - for (;;) { - if (list == &ftrace_events) - return NULL; - - call = list_entry(list, struct ftrace_event_call, list); - + list_for_each_entry_continue(call, &ftrace_events, list) { /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ if (call->regfunc) - break; - - list = list->next; + return call; } - m->private = list->next; - - return call; + return NULL; } static void *t_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call = NULL; + struct ftrace_event_call *call; loff_t l; mutex_lock(&event_mutex); - m->private = ftrace_events.next; + call = list_entry(&ftrace_events, struct ftrace_event_call, list); for (l = 0; l <= *pos; ) { - call = t_next(m, NULL, &l); + call = t_next(m, call, &l); if (!call) break; } @@ -345,37 +302,28 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct list_head *list = m->private; - struct ftrace_event_call *call; + struct ftrace_event_call *call = v; (*pos)++; - retry: - if (list == &ftrace_events) - return NULL; - - call = list_entry(list, struct ftrace_event_call, list); - - if (!call->enabled) { - list = list->next; - goto retry; + list_for_each_entry_continue(call, &ftrace_events, list) { + if (call->enabled) + return call; } - m->private = list->next; - - return call; + return NULL; } static void *s_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call = NULL; + struct ftrace_event_call *call; loff_t l; mutex_lock(&event_mutex); - m->private = ftrace_events.next; + call = list_entry(&ftrace_events, struct ftrace_event_call, list); for (l = 0; l <= *pos; ) { - call = s_next(m, NULL, &l); + call = s_next(m, call, &l); if (!call) break; } @@ -574,7 +522,7 @@ static int trace_write_header(struct trace_seq *s) FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), - FIELD(int, tgid)); + FIELD(int, lock_depth)); } static ssize_t @@ -1242,7 +1190,7 @@ static int trace_module_notify(struct notifier_block *self, } #endif /* CONFIG_MODULES */ -struct notifier_block trace_module_nb = { +static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, }; @@ -1414,6 +1362,18 @@ static __init void event_trace_self_tests(void) if (!call->regfunc) continue; +/* + * Testing syscall events here is pretty useless, but + * we still do it if configured. But this is time consuming. + * What we really need is a user thread to perform the + * syscalls as we test. + */ +#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS + if (call->system && + strcmp(call->system, "syscalls") == 0) + continue; +#endif + pr_info("Testing event %s: ", call->name); /* @@ -1487,7 +1447,7 @@ static __init void event_trace_self_tests(void) #ifdef CONFIG_FUNCTION_TRACER -static DEFINE_PER_CPU(atomic_t, test_event_disable); +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip) @@ -1504,7 +1464,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) pc = preempt_count(); resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; @@ -1523,7 +1483,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); out: - atomic_dec(&per_cpu(test_event_disable, cpu)); + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); ftrace_preempt_enable(resched); } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 93660fb..2324578 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -121,6 +121,47 @@ struct filter_parse_state { } operand; }; +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + DEFINE_COMPARISON_PRED(s64); DEFINE_COMPARISON_PRED(u64); DEFINE_COMPARISON_PRED(s32); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index a79ef6f..ed7d480 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -15,147 +15,124 @@ #include "trace_output.h" +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ftrace -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args +/* not needed for this file */ +#undef __field_struct +#define __field_struct(type, item) -extern void __bad_type_size(void); +#undef __field +#define __field(type, item) type item; -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - if (sizeof(type) != sizeof(field.item)) \ - __bad_type_size(); \ +#undef __field_desc +#define __field_desc(type, container, item) type item; + +#undef __array +#define __array(type, item, size) type item[size]; + +#undef __array_desc +#define __array_desc(type, container, item, size) type item[size]; + +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; + +#undef F_STRUCT +#define F_STRUCT(args...) args + +#undef F_printk +#define F_printk(fmt, args...) fmt, args + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __used ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force cmpile-time check on F_printk() */ \ + printk(print); \ +} + +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + +#include "trace_entries.h" + + +#undef __field +#define __field(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ if (!ret) \ return 0; +#undef __field_desc +#define __field_desc(type, container, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), container.item), \ + sizeof(field.container.item)); \ + if (!ret) \ + return 0; -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ - ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ if (!ret) \ return 0; -#undef TRACE_FIELD_ZERO -#define TRACE_FIELD_ZERO(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:0;\n", \ - (unsigned int)offsetof(typeof(field), item)); \ +#undef __array_desc +#define __array_desc(type, container, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), container.item), \ + sizeof(field.container.item)); \ if (!ret) \ return 0; -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - TRACE_FIELD(type, item, assign) +#undef __dynamic_array +#define __dynamic_array(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%zu;\tsize:0;\n", \ + offsetof(typeof(field), item)); \ + if (!ret) \ + return 0; -#undef TP_RAW_FMT -#define TP_RAW_FMT(args...) args +#undef F_printk +#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -static int \ -ftrace_format_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - struct args field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ - \ - return ret; \ -} +#undef __entry +#define __entry REC -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ static int \ -ftrace_format_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ +ftrace_format_##name(struct ftrace_event_call *unused, \ + struct trace_seq *s) \ { \ - struct args field; \ - int ret; \ + struct struct_name field __attribute__((unused)); \ + int ret = 0; \ \ tstruct; \ \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + trace_seq_printf(s, "\nprint fmt: " print); \ \ return ret; \ } -#include "trace_event_types.h" - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - TRACE_FIELD(type, item, assign) - -#undef TRACE_FIELD_ZERO -#define TRACE_FIELD_ZERO(type, item) - -#undef TP_CMD -#define TP_CMD(cmd...) cmd - -#undef TRACE_ENTRY -#define TRACE_ENTRY entry - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ - cmd; - -static int ftrace_raw_init_event(struct ftrace_event_call *event_call) -{ - INIT_LIST_HEAD(&event_call->fields); - - return 0; -} - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \ - \ -struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .id = proto, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event, \ - .show_format = ftrace_format_##call, \ - .define_fields = ftrace_define_fields_##call, \ -}; \ - -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) \ - \ -struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .id = proto, \ - .system = __stringify(TRACE_SYSTEM), \ - .show_format = ftrace_format_##call, \ -}; - -#include "trace_event_types.h" +#include "trace_entries.h" -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ +#undef __field +#define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ @@ -163,32 +140,45 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ if (ret) \ return ret; -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ +#undef __field_desc +#define __field_desc(type, container, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ sizeof(field.item), 0, FILTER_OTHER); \ if (ret) \ return ret; -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed, \ +#undef __array_desc +#define __array_desc(type, container, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), 0, \ FILTER_OTHER); \ if (ret) \ return ret; -#undef TRACE_FIELD_ZERO -#define TRACE_FIELD_ZERO(type, item) +#undef __dynamic_array +#define __dynamic_array(type, item) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ int \ -ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ +ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ - struct args field; \ + struct struct_name field; \ int ret; \ \ ret = trace_define_common_fields(event_call); \ @@ -200,8 +190,41 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ return ret; \ } -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) +#include "trace_entries.h" + +static int ftrace_raw_init_event(struct ftrace_event_call *call) +{ + INIT_LIST_HEAD(&call->fields); + return 0; +} + +#undef __field +#define __field(type, item) + +#undef __field_desc +#define __field_desc(type, container, item) + +#undef __array +#define __array(type, item, len) + +#undef __array_desc +#define __array_desc(type, container, item, len) + +#undef __dynamic_array +#define __dynamic_array(type, item) + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ + \ +struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .id = type, \ + .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_raw_init_event, \ + .show_format = ftrace_format_##call, \ + .define_fields = ftrace_define_fields_##call, \ +}; \ -#include "trace_event_types.h" +#include "trace_entries.h" diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5b01b94..b3f3776 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -290,7 +290,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, { long count = (long)data; - seq_printf(m, "%pf:", (void *)ip); + seq_printf(m, "%ps:", (void *)ip); if (ops == &traceon_probe_ops) seq_printf(m, "traceon"); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b3749a2..45e6c01 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, if (unlikely(current->ret_stack[index].fp != frame_pointer)) { ftrace_graph_stop(); WARN(1, "Bad frame pointer: expected %lx, received %lx\n" - " from func %pF return to %lx\n", + " from func %ps return to %lx\n", current->ret_stack[index].fp, frame_pointer, (void *)current->ret_stack[index].func, @@ -364,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid) } +static enum print_line_t +print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +{ + if (!trace_seq_putc(s, ' ')) + return 0; + + return trace_print_lat_fmt(s, entry); +} + /* If the pid changed since the last trace, output this event */ static enum print_line_t verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) @@ -521,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } + /* Proc */ if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { ret = print_graph_proc(s, pid); @@ -659,7 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); + ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -702,7 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); + ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -758,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } + /* Latency format */ + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + ret = print_graph_lat_fmt(s, ent); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + return 0; } @@ -952,28 +969,59 @@ print_graph_function(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static void print_lat_header(struct seq_file *s) +{ + static const char spaces[] = " " /* 16 spaces */ + " " /* 4 spaces */ + " "; /* 17 spaces */ + int size = 0; + + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + size += 16; + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + size += 4; + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + size += 17; + + seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); + seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); + seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); + seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); + seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); + seq_printf(s, "#%.*s|||| / \n", size, spaces); +} + static void print_graph_headers(struct seq_file *s) { + int lat = trace_flags & TRACE_ITER_LATENCY_FMT; + + if (lat) + print_lat_header(s); + /* 1st line */ - seq_printf(s, "# "); + seq_printf(s, "#"); if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " TIME "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "CPU"); + seq_printf(s, " CPU"); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " TASK/PID "); + seq_printf(s, " TASK/PID "); + if (lat) + seq_printf(s, "|||||"); if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); /* 2nd line */ - seq_printf(s, "# "); + seq_printf(s, "#"); if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " | "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "| "); + seq_printf(s, " | "); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " | | "); + seq_printf(s, " | | "); + if (lat) + seq_printf(s, "|||||"); if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index ca7d7c4..23b6385 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) seq_print_ip_sym(seq, it->from, symflags) && trace_seq_printf(seq, "\n")) return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE;; + return TRACE_TYPE_PARTIAL_LINE; } return TRACE_TYPE_UNHANDLED; } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5555b75..3aa7eaa 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr, unsigned long parent_ip, int cpu) { - unsigned long latency, t0, t1; cycle_t T0, T1, delta; unsigned long flags; int pc; - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0; @@ -157,18 +152,15 @@ check_critical_timing(struct trace_array *tr, trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); - latency = nsecs_to_usecs(delta); - if (data->critical_sequence != max_sequence) goto out_unlock; - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - data->critical_end = parent_ip; - update_max_tr_single(tr, current, cpu); + if (likely(!is_tracing_stopped())) { + tracing_max_latency = delta; + update_max_tr_single(tr, current, cpu); + } max_sequence++; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f6821f1..09cba270 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -28,7 +28,7 @@ #include <linux/string.h> #include <linux/ctype.h> #include <linux/ptrace.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include "trace.h" #include "trace_output.h" @@ -1176,7 +1176,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, entry->ip = (unsigned long)kp->addr; for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - perf_tpcounter_event(call->id, entry->ip, 1, entry, size); + perf_tp_event(call->id, entry->ip, 1, entry, size); } while (0); return 0; } @@ -1213,7 +1213,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, entry->ret_ip = (unsigned long)ri->ret_addr; for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - perf_tpcounter_event(call->id, entry->ret_ip, 1, entry, size); + perf_tp_event(call->id, entry->ret_ip, 1, entry, size); } while (0); return 0; } @@ -1222,10 +1222,8 @@ static int probe_profile_enable(struct ftrace_event_call *call) { struct trace_probe *tp = (struct trace_probe *)call->data; - if (atomic_inc_return(&call->profile_count)) - return 0; - tp->flags |= TP_FLAG_PROFILE; + if (probe_is_return(tp)) return enable_kretprobe(&tp->rp); else @@ -1236,10 +1234,9 @@ static void probe_profile_disable(struct ftrace_event_call *call) { struct trace_probe *tp = (struct trace_probe *)call->data; - if (atomic_add_negative(-1, &call->profile_count)) - tp->flags &= ~TP_FLAG_PROFILE; + tp->flags &= ~TP_FLAG_PROFILE; - if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { + if (!(tp->flags & TP_FLAG_TRACE)) { if (probe_is_return(tp)) disable_kretprobe(&tp->rp); else diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index c4c9bbd..0acd834 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -307,6 +307,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { + struct ftrace_event_call *call = &event_mmiotrace_rw; struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; @@ -320,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->rw = *rw; - trace_buffer_unlock_commit(buffer, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -334,6 +337,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { + struct ftrace_event_call *call = &event_mmiotrace_map; struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; @@ -347,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->map = *map; - trace_buffer_unlock_commit(buffer, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e0c2545..f572f44 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, * since individual threads might have already quit! */ rcu_read_lock(); - task = find_task_by_vpid(entry->ent.tgid); + task = find_task_by_vpid(entry->tgid); if (task) mm = get_task_mm(task); rcu_read_unlock(); @@ -460,18 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } -static int -lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +/** + * trace_print_lat_fmt - print the irq, preempt and lockdep fields + * @s: trace seq struct to write to + * @entry: The trace entry field from the ring buffer + * + * Prints the generic fields of irqs off, in hard or softirq, preempt + * count and lock depth. + */ +int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { int hardirq, softirq; - char comm[TASK_COMM_LEN]; + int ret; - trace_find_cmdline(entry->pid, comm); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", - comm, entry->pid, cpu, + if (!trace_seq_printf(s, "%c%c%c", (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', @@ -481,9 +486,30 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) hardirq ? 'h' : softirq ? 's' : '.')) return 0; + if (entry->lock_depth < 0) + ret = trace_seq_putc(s, '.'); + else + ret = trace_seq_printf(s, "%d", entry->lock_depth); + if (!ret) + return 0; + if (entry->preempt_count) return trace_seq_printf(s, "%x", entry->preempt_count); - return trace_seq_puts(s, "."); + return trace_seq_putc(s, '.'); +} + +static int +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + if (!trace_seq_printf(s, "%8.8s-%-5d %3d", + comm, entry->pid, cpu)) + return 0; + + return trace_print_lat_fmt(s, entry); } static unsigned long preempt_mark_thresh = 100; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index d38bec4..9d91c72 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); +extern int +trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); /* used by module unregistering */ extern int __unregister_ftrace_event(struct trace_event *event); diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c deleted file mode 100644 index fe1a00f..0000000 --- a/kernel/trace/trace_power.c +++ /dev/null @@ -1,218 +0,0 @@ -/* - * ring buffer based C-state tracer - * - * Arjan van de Ven <arjan@linux.intel.com> - * Copyright (C) 2008 Intel Corporation - * - * Much is borrowed from trace_boot.c which is - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - -#include <linux/init.h> -#include <linux/debugfs.h> -#include <trace/power.h> -#include <linux/kallsyms.h> -#include <linux/module.h> - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *power_trace; -static int __read_mostly trace_power_enabled; - -static void probe_power_start(struct power_trace *it, unsigned int type, - unsigned int level) -{ - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); -} - - -static void probe_power_end(struct power_trace *it) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - buffer = tr->buffer; - - preempt_disable(); - it->end = ktime_get(); - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(buffer, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -static void probe_power_mark(struct power_trace *it, unsigned int type, - unsigned int level) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - buffer = tr->buffer; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); - preempt_disable(); - it->end = it->stamp; - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(buffer, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -static int tracing_power_register(void) -{ - int ret; - - ret = register_trace_power_start(probe_power_start); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_start\n"); - return ret; - } - ret = register_trace_power_end(probe_power_end); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_end\n"); - goto fail_start; - } - ret = register_trace_power_mark(probe_power_mark); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_mark\n"); - goto fail_end; - } - return ret; -fail_end: - unregister_trace_power_end(probe_power_end); -fail_start: - unregister_trace_power_start(probe_power_start); - return ret; -} - -static void start_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 1; -} - -static void stop_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 0; -} - -static void power_trace_reset(struct trace_array *tr) -{ - trace_power_enabled = 0; - unregister_trace_power_start(probe_power_start); - unregister_trace_power_end(probe_power_end); - unregister_trace_power_mark(probe_power_mark); -} - - -static int power_trace_init(struct trace_array *tr) -{ - power_trace = tr; - - trace_power_enabled = 1; - tracing_power_register(); - - tracing_reset_online_cpus(tr); - return 0; -} - -static enum print_line_t power_print_line(struct trace_iterator *iter) -{ - int ret = 0; - struct trace_entry *entry = iter->ent; - struct trace_power *field ; - struct power_trace *it; - struct trace_seq *s = &iter->seq; - struct timespec stamp; - struct timespec duration; - - trace_assign_type(field, entry); - it = &field->state_data; - stamp = ktime_to_timespec(it->stamp); - duration = ktime_to_timespec(ktime_sub(it->end, it->stamp)); - - if (entry->type == TRACE_POWER) { - if (it->type == POWER_CSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu, - duration.tv_sec, - duration.tv_nsec); - if (it->type == POWER_PSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; - } - return TRACE_TYPE_UNHANDLED; -} - -static void power_print_header(struct seq_file *s) -{ - seq_puts(s, "# TIMESTAMP STATE EVENT\n"); - seq_puts(s, "# | | |\n"); -} - -static struct tracer power_tracer __read_mostly = -{ - .name = "power", - .init = power_trace_init, - .start = start_power_trace, - .stop = stop_power_trace, - .reset = power_trace_reset, - .print_line = power_print_line, - .print_header = power_print_header, -}; - -static int init_power_trace(void) -{ - return register_tracer(&power_tracer); -} -device_initcall(init_power_trace); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 687699d..2547d88 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -11,7 +11,6 @@ #include <linux/ftrace.h> #include <linux/string.h> #include <linux/module.h> -#include <linux/marker.h> #include <linux/mutex.h> #include <linux/ctype.h> #include <linux/list.h> diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ad69f10..26185d7 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled; static struct task_struct *wakeup_task; static int wakeup_cpu; +static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; @@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); + if (cpu != wakeup_current_cpu) + goto out_enable; + data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); if (unlikely(disabled != 1)) goto out; local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); - - if (unlikely(!wakeup_task)) - goto unlock; - - /* - * The task can't disappear because it needs to - * wake up first, and we have the wakeup_lock. - */ - if (task_cpu(wakeup_task) != cpu) - goto unlock; trace_function(tr, ip, parent_ip, flags, pc); - unlock: - __raw_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: atomic_dec(&data->disabled); - + out_enable: ftrace_preempt_enable(resched); } @@ -107,11 +98,18 @@ static int report_latency(cycle_t delta) return 1; } +static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) +{ + if (task != wakeup_task) + return; + + wakeup_current_cpu = cpu; +} + static void notrace probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - unsigned long latency = 0, t0 = 0, t1 = 0; struct trace_array_cpu *data; cycle_t T0, T1, delta; unsigned long flags; @@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0; @@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (!report_latency(delta)) goto out_unlock; - latency = nsecs_to_usecs(delta); - - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - - update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + if (likely(!is_tracing_stopped())) { + tracing_max_latency = delta; + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + } out_unlock: __wakeup_reset(wakeup_trace); @@ -244,6 +235,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) __wakeup_reset(wakeup_trace); wakeup_cpu = task_cpu(p); + wakeup_current_cpu = wakeup_cpu; wakeup_prio = p->prio; wakeup_task = p; @@ -293,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr) goto fail_deprobe_wake_new; } + ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_migrate_task\n"); + return; + } + wakeup_reset(tr); /* @@ -325,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr) unregister_trace_sched_switch(probe_wakeup_sched_switch); unregister_trace_sched_wakeup_new(probe_wakeup); unregister_trace_sched_wakeup(probe_wakeup); + unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); } static int __wakeup_tracer_init(struct trace_array *tr) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index dfc55fe..1b050ab 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -2,7 +2,7 @@ #include <trace/events/syscalls.h> #include <linux/kernel.h> #include <linux/ftrace.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/syscall.h> #include "trace_output.h" @@ -384,10 +384,13 @@ static int sys_prof_refcount_exit; static void prof_syscall_enter(struct pt_regs *regs, long id) { - struct syscall_trace_enter *rec; struct syscall_metadata *sys_data; + struct syscall_trace_enter *rec; + unsigned long flags; + char *raw_data; int syscall_nr; int size; + int cpu; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) @@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - do { - char raw_data[size]; + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return; + + /* Protect the per cpu buffer, begin the rcu read side */ + local_irq_save(flags); - /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + cpu = smp_processor_id(); + + if (in_nmi()) + raw_data = rcu_dereference(trace_profile_buf_nmi); + else + raw_data = rcu_dereference(trace_profile_buf); + + if (!raw_data) + goto end; - rec = (struct syscall_trace_enter *) raw_data; - tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->enter_id; - rec->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, sys_data->nb_args, - (unsigned long *)&rec->args); - perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); - } while(0); + raw_data = per_cpu_ptr(raw_data, cpu); + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + rec = (struct syscall_trace_enter *) raw_data; + tracing_generic_entry_update(&rec->ent, 0, 0); + rec->ent.type = sys_data->enter_id; + rec->nr = syscall_nr; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, + (unsigned long *)&rec->args); + perf_tp_event(sys_data->enter_id, 0, 1, rec, size); + +end: + local_irq_restore(flags); } int reg_prof_syscall_enter(char *name) @@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name) static void prof_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; - struct syscall_trace_exit rec; + struct syscall_trace_exit *rec; + unsigned long flags; int syscall_nr; + char *raw_data; + int size; + int cpu; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) @@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - tracing_generic_entry_update(&rec.ent, 0, 0); - rec.ent.type = sys_data->exit_id; - rec.nr = syscall_nr; - rec.ret = syscall_get_return_value(current, regs); + /* We can probably do that at build time */ + size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); - perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); + /* + * Impossible, but be paranoid with the future + * How to put this check outside runtime? + */ + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "exit event has grown above profile buffer size")) + return; + + /* Protect the per cpu buffer, begin the rcu read side */ + local_irq_save(flags); + cpu = smp_processor_id(); + + if (in_nmi()) + raw_data = rcu_dereference(trace_profile_buf_nmi); + else + raw_data = rcu_dereference(trace_profile_buf); + + if (!raw_data) + goto end; + + raw_data = per_cpu_ptr(raw_data, cpu); + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + rec = (struct syscall_trace_exit *)raw_data; + + tracing_generic_entry_update(&rec->ent, 0, 0); + rec->ent.type = sys_data->exit_id; + rec->nr = syscall_nr; + rec->ret = syscall_get_return_value(current, regs); + + perf_tp_event(sys_data->exit_id, 0, 1, rec, size); + +end: + local_irq_restore(flags); } int reg_prof_syscall_exit(char *name) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 9489a0a..cc89be5 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -48,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; /* * Note about RCU : - * It is used to to delay the free of multiple probes array until a quiescent + * It is used to delay the free of multiple probes array until a quiescent * state is reached. * Tracepoint entries modifications are protected by the tracepoints_mutex. */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0668795..addfe2d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -317,8 +317,6 @@ static int worker_thread(void *__cwq) if (cwq->wq->freezeable) set_freezable(); - set_user_nice(current, -5); - for (;;) { prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); if (!freezing(current) && @@ -600,7 +598,12 @@ static struct workqueue_struct *keventd_wq __read_mostly; * schedule_work - put work task in global workqueue * @work: job to be done * - * This puts a job in the kernel-global workqueue. + * Returns zero if @work was already on the kernel-global workqueue and + * non-zero otherwise. + * + * This puts a job in the kernel-global workqueue if it was not already + * queued and leaves it in the same position on the kernel-global + * workqueue otherwise. */ int schedule_work(struct work_struct *work) { |