diff options
author | Thomas Gleixner <tglx@linutronix.de> | 2010-05-10 11:59:37 +0200 |
---|---|---|
committer | Thomas Gleixner <tglx@linutronix.de> | 2010-05-10 14:20:42 +0200 |
commit | dbb6be6d5e974c42bbecd183effaa0df69e1dd8b (patch) | |
tree | 5735cb47e70853d057a9881dd0ce44b83e88fa63 /kernel | |
parent | 6a867a395558a7f882d041783e4cdea6744ca2bf (diff) | |
parent | b57f95a38233a2e73b679bea4a5453a1cc2a1cc9 (diff) | |
download | op-kernel-dev-dbb6be6d5e974c42bbecd183effaa0df69e1dd8b.zip op-kernel-dev-dbb6be6d5e974c42bbecd183effaa0df69e1dd8b.tar.gz |
Merge branch 'linus' into timers/core
Reason: Further posix_cpu_timer patches depend on mainline changes
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel')
77 files changed, 755 insertions, 413 deletions
diff --git a/kernel/async.c b/kernel/async.c index 27235f5..15319d6 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -56,6 +56,7 @@ asynchronous and synchronous parts of the kernel. #include <linux/init.h> #include <linux/kthread.h> #include <linux/delay.h> +#include <linux/slab.h> #include <asm/atomic.h> static async_cookie_t next_cookie = 1; diff --git a/kernel/audit.c b/kernel/audit.c index 78f7f86..c71bd26 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -46,6 +46,7 @@ #include <asm/atomic.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/err.h> #include <linux/kthread.h> diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 028e856..46a57b5 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -3,6 +3,7 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/kthread.h> +#include <linux/slab.h> struct audit_tree; struct audit_chunk; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index cc7e879..8df4369 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -27,6 +27,7 @@ #include <linux/namei.h> #include <linux/netlink.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/inotify.h> #include <linux/security.h> #include "audit.h" diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a706040..ce08041 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -27,6 +27,7 @@ #include <linux/namei.h> #include <linux/netlink.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/security.h> #include "audit.h" diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f3a461c..3828ad5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -49,6 +49,7 @@ #include <linux/namei.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/mount.h> #include <linux/socket.h> #include <linux/mqueue.h> @@ -1893,7 +1894,7 @@ static int audit_inc_name_count(struct audit_context *context, { if (context->name_count >= AUDIT_NAMES) { if (inode) - printk(KERN_DEBUG "name_count maxed, losing inode data: " + printk(KERN_DEBUG "audit: name_count maxed, losing inode data: " "dev=%02x:%02x, inode=%lu\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ef909a3..3a53c77 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -27,7 +27,6 @@ */ #include <linux/cgroup.h> -#include <linux/module.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/fs.h> @@ -1647,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry) int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; - struct dentry *dentry = rcu_dereference(cgrp->dentry); + struct dentry *dentry = rcu_dereference_check(cgrp->dentry, + rcu_read_lock_held() || + cgroup_lock_is_held()); if (!dentry || cgrp == dummytop) { /* @@ -1663,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) *--start = '\0'; for (;;) { int len = dentry->d_name.len; + if ((start -= len) < buf) return -ENAMETOOLONG; - memcpy(start, cgrp->dentry->d_name.name, len); + memcpy(start, dentry->d_name.name, len); cgrp = cgrp->parent; if (!cgrp) break; - dentry = rcu_dereference(cgrp->dentry); + + dentry = rcu_dereference_check(cgrp->dentry, + rcu_read_lock_held() || + cgroup_lock_is_held()); if (!cgrp->parent) continue; if (--start < buf) @@ -4556,13 +4561,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, { int subsys_id, i, depth = 0; struct cgroup_subsys_state *parent_css, *child_css; - struct css_id *child_id, *parent_id = NULL; + struct css_id *child_id, *parent_id; subsys_id = ss->subsys_id; parent_css = parent->subsys[subsys_id]; child_css = child->subsys[subsys_id]; - depth = css_depth(parent_css) + 1; parent_id = parent_css->id; + depth = parent_id->depth; child_id = get_new_cssid(ss, depth); if (IS_ERR(child_id)) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 59e9ef6..e5c0244 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -15,6 +15,7 @@ */ #include <linux/module.h> +#include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fs.h> #include <linux/uaccess.h> @@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } -int cgroup_frozen(struct task_struct *task) +int cgroup_freezing_or_frozen(struct task_struct *task) { struct freezer *freezer; enum freezer_state state; task_lock(task); freezer = task_freezer(task); - state = freezer->state; + if (!freezer->css.cgroup->parent) + state = CGROUP_THAWED; /* root cgroup can't be frozen */ + else + state = freezer->state; task_unlock(task); - return state == CGROUP_FROZEN; + return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); } /* @@ -201,9 +205,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) * No lock is needed, since the task isn't on tasklist yet, * so it can't be moved to another cgroup, which means the * freezer won't be removed and will be valid during this - * function call. + * function call. Nevertheless, apply RCU read-side critical + * section to suppress RCU lockdep false positives. */ + rcu_read_lock(); freezer = task_freezer(task); + rcu_read_unlock(); /* * The root cgroup is non-freezable, so we can skip the diff --git a/kernel/compat.c b/kernel/compat.c index f6c204f0..7f40e92 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -25,6 +25,7 @@ #include <linux/posix-timers.h> #include <linux/times.h> #include <linux/ptrace.h> +#include <linux/gfp.h> #include <asm/uaccess.h> diff --git a/kernel/cpu.c b/kernel/cpu.c index f8cced2..25bba73 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -14,6 +14,7 @@ #include <linux/kthread.h> #include <linux/stop_machine.h> #include <linux/mutex.h> +#include <linux/gfp.h> #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba401fa..d109467 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * call to guarantee_online_mems(), as we know no one is changing * our task's cpuset. * - * Hold callback_mutex around the two modifications of our tasks - * mems_allowed to synchronize with cpuset_mems_allowed(). - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -973,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - nodemask_t newmems; + NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); + + if (!newmems) + return; cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, &newmems); + guarantee_online_mems(cs, newmems); task_lock(p); - cpuset_change_task_nodemask(p, &newmems); + cpuset_change_task_nodemask(p, newmems); task_unlock(p); + NODEMASK_FREE(newmems); + mm = get_task_mm(p); if (!mm) return; @@ -1051,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { - nodemask_t oldmem; + NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL); int retval; struct ptr_heap heap; + if (!oldmem) + return -ENOMEM; + /* * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; * it's read-only */ - if (cs == &top_cpuset) - return -EACCES; + if (cs == &top_cpuset) { + retval = -EACCES; + goto done; + } /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. @@ -1076,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, goto done; if (!nodes_subset(trialcs->mems_allowed, - node_states[N_HIGH_MEMORY])) - return -EINVAL; + node_states[N_HIGH_MEMORY])) { + retval = -EINVAL; + goto done; + } } - oldmem = cs->mems_allowed; - if (nodes_equal(oldmem, trialcs->mems_allowed)) { + *oldmem = cs->mems_allowed; + if (nodes_equal(*oldmem, trialcs->mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } @@ -1096,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask(cs, &oldmem, &heap); + update_tasks_nodemask(cs, oldmem, &heap); heap_free(&heap); done: + NODEMASK_FREE(oldmem); return retval; } @@ -1384,40 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *oldcont, struct task_struct *tsk, bool threadgroup) { - nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); + NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); + NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); + + if (from == NULL || to == NULL) + goto alloc_fail; if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); - to = node_possible_map; } else { guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(cs, &to); } + guarantee_online_mems(cs, to); /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, &to, cs); + cpuset_attach_task(tsk, to, cs); if (threadgroup) { struct task_struct *c; rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, &to, cs); + cpuset_attach_task(c, to, cs); } rcu_read_unlock(); } /* change mm; only needs to be done once even if threadgroup */ - from = oldcs->mems_allowed; - to = cs->mems_allowed; + *from = oldcs->mems_allowed; + *to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, &to); + mpol_rebind_mm(mm, to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &from, &to); + cpuset_migrate_mm(mm, from, to); mmput(mm); } + +alloc_fail: + NODEMASK_FREE(from); + NODEMASK_FREE(to); } /* The various types of files and directories in a cpuset file system */ @@ -1562,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { - nodemask_t mask; + NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); + int retval; + + if (mask == NULL) + return -ENOMEM; mutex_lock(&callback_mutex); - mask = cs->mems_allowed; + *mask = cs->mems_allowed; mutex_unlock(&callback_mutex); - return nodelist_scnprintf(page, PAGE_SIZE, mask); + retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); + + NODEMASK_FREE(mask); + + return retval; } static ssize_t cpuset_common_file_read(struct cgroup *cont, @@ -1997,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root) struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ struct cgroup *cont; - nodemask_t oldmems; + NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); + + if (oldmems == NULL) + return; list_add_tail((struct list_head *)&root->stack_list, &queue); @@ -2014,7 +2042,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; - oldmems = cp->mems_allowed; + *oldmems = cp->mems_allowed; /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); @@ -2030,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root) remove_tasks_in_empty_cpuset(cp); else { update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems, NULL); + update_tasks_nodemask(cp, oldmems, NULL); } } + NODEMASK_FREE(oldmems); } /* @@ -2090,20 +2119,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { + NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); + + if (oldmems == NULL) + return NOTIFY_DONE; + cgroup_lock(); switch (action) { case MEM_ONLINE: - case MEM_OFFLINE: + *oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - if (action == MEM_OFFLINE) - scan_for_empty_cpusets(&top_cpuset); + update_tasks_nodemask(&top_cpuset, oldmems, NULL); + break; + case MEM_OFFLINE: + /* + * needn't update top_cpuset.mems_allowed explicitly because + * scan_for_empty_cpusets() will update it. + */ + scan_for_empty_cpusets(&top_cpuset); break; default: break; } cgroup_unlock(); + + NODEMASK_FREE(oldmems); return NOTIFY_OK; } #endif diff --git a/kernel/cred.c b/kernel/cred.c index 1ed8ca1..62af181 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -10,6 +10,7 @@ */ #include <linux/module.h> #include <linux/cred.h> +#include <linux/slab.h> #include <linux/sched.h> #include <linux/key.h> #include <linux/keyctl.h> @@ -364,7 +365,7 @@ struct cred *prepare_usermodehelper_creds(void) new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); if (!new) - return NULL; + goto free_tgcred; kdebug("prepare_usermodehelper_creds() alloc %p", new); @@ -398,6 +399,12 @@ struct cred *prepare_usermodehelper_creds(void) error: put_cred(new); return NULL; + +free_tgcred: +#ifdef CONFIG_KEYS + kfree(tgcred); +#endif + return NULL; } /* @@ -786,8 +793,6 @@ bool creds_are_invalid(const struct cred *cred) { if (cred->magic != CRED_MAGIC) return true; - if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers)) - return true; #ifdef CONFIG_SECURITY_SELINUX if (selinux_is_enabled()) { if ((unsigned long) cred->security < PAGE_SIZE) diff --git a/kernel/early_res.c b/kernel/early_res.c index 3cb2c66..31aa933 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -333,6 +333,12 @@ void __init free_early_partial(u64 start, u64 end) struct early_res *r; int i; + if (start == end) + return; + + if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end)) + return; + try_next: i = find_overlapped_early(start, end); if (i >= max_early_res) diff --git a/kernel/exit.c b/kernel/exit.c index cce59cb..7f2683a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -953,7 +953,8 @@ NORET_TYPE void do_exit(long code) acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ - sync_mm_rss(tsk, tsk->mm); + if (tsk->mm) + sync_mm_rss(tsk, tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/fork.c b/kernel/fork.c index 4799c5f..44b0791 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1052,6 +1052,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; #endif +#if defined(SPLIT_RSS_COUNTING) + memset(&p->rss_stat, 0, sizeof(p->rss_stat)); +#endif p->default_timer_slack_ns = current->timer_slack_ns; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 42ec11b..b7091d5 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -359,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) if (desc->chip->ack) desc->chip->ack(irq); } + desc->status |= IRQ_MASKED; +} + +static inline void mask_irq(struct irq_desc *desc, int irq) +{ + if (desc->chip->mask) { + desc->chip->mask(irq); + desc->status |= IRQ_MASKED; + } +} + +static inline void unmask_irq(struct irq_desc *desc, int irq) +{ + if (desc->chip->unmask) { + desc->chip->unmask(irq); + desc->status &= ~IRQ_MASKED; + } } /* @@ -484,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; - if (unlikely(desc->status & IRQ_ONESHOT)) - desc->status |= IRQ_MASKED; - else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) - desc->chip->unmask(irq); + if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) + unmask_irq(desc, irq); out_unlock: raw_spin_unlock(&desc->lock); } @@ -524,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; - if (desc->chip->mask) - desc->chip->mask(irq); + mask_irq(desc, irq); goto out; } @@ -593,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) irqreturn_t action_ret; if (unlikely(!action)) { - desc->chip->mask(irq); + mask_irq(desc, irq); goto out_unlock; } @@ -605,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_MASKED))) { - desc->chip->unmask(irq); - desc->status &= ~IRQ_MASKED; + unmask_irq(desc, irq); } desc->status &= ~IRQ_PENDING; @@ -716,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, __set_irq_handler(irq, handle, 0, name); } -void __init set_irq_noprobe(unsigned int irq) +void set_irq_noprobe(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -731,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } -void __init set_irq_probe(unsigned int irq) +void set_irq_probe(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index eb6078c..704e488 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -382,6 +382,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; + unsigned long flags; if (!desc) return 0; @@ -389,11 +390,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) if (desc->status & IRQ_NOREQUEST) return 0; + raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; if (action) if (irqflags & action->flags & IRQF_SHARED) action = NULL; + raw_spin_unlock_irqrestore(&desc->lock, flags); + return !action; } @@ -483,8 +487,26 @@ static int irq_wait_for_interrupt(struct irqaction *action) */ static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) { +again: chip_bus_lock(irq, desc); raw_spin_lock_irq(&desc->lock); + + /* + * Implausible though it may be we need to protect us against + * the following scenario: + * + * The thread is faster done than the hard interrupt handler + * on the other CPU. If we unmask the irq line then the + * interrupt can come in again and masks the line, leaves due + * to IRQ_INPROGRESS and the irq line is masked forever. + */ + if (unlikely(desc->status & IRQ_INPROGRESS)) { + raw_spin_unlock_irq(&desc->lock); + chip_bus_sync_unlock(irq, desc); + cpu_relax(); + goto again; + } + if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; desc->chip->unmask(irq); @@ -735,6 +757,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; + /* + * Force MSI interrupts to run with interrupts + * disabled. The multi vector cards can cause stack + * overflows due to nested interrupts when enough of + * them are directed to a core and fire at the same + * time. + */ + if (desc->msi_desc) + new->flags |= IRQF_DISABLED; + if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 963559d..65d3845 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -6,6 +6,7 @@ */ #include <linux/irq.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6f50ecc..7a6eb04 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -7,6 +7,7 @@ */ #include <linux/irq.h> +#include <linux/gfp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/interrupt.h> diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 8e5288a..13aff29 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -21,6 +21,7 @@ #include <linux/sched.h> /* for cond_resched */ #include <linux/mm.h> #include <linux/ctype.h> +#include <linux/slab.h> #include <asm/sections.h> diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 761fdd2..11f3515 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -69,9 +69,16 @@ struct kgdb_state { struct pt_regs *linux_regs; }; +/* Exception state values */ +#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ +#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ +#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ +#define DCPU_SSTEP 0x8 /* CPU is single stepping */ + static struct debuggerinfo_struct { void *debuggerinfo; struct task_struct *task; + int exception_state; } kgdb_info[NR_CPUS]; /** @@ -391,27 +398,22 @@ int kgdb_mem2hex(char *mem, char *buf, int count) /* * Copy the binary array pointed to by buf into mem. Fix $, #, and - * 0x7d escaped with 0x7d. Return a pointer to the character after - * the last byte written. + * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success. + * The input buf is overwitten with the result to write to mem. */ static int kgdb_ebin2mem(char *buf, char *mem, int count) { - int err = 0; - char c; + int size = 0; + char *c = buf; while (count-- > 0) { - c = *buf++; - if (c == 0x7d) - c = *buf++ ^ 0x20; - - err = probe_kernel_write(mem, &c, 1); - if (err) - break; - - mem++; + c[size] = *buf++; + if (c[size] == 0x7d) + c[size] = *buf++ ^ 0x20; + size++; } - return err; + return probe_kernel_write(mem, c, size); } /* @@ -563,49 +565,6 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid) } /* - * CPU debug state control: - */ - -#ifdef CONFIG_SMP -static void kgdb_wait(struct pt_regs *regs) -{ - unsigned long flags; - int cpu; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - kgdb_info[cpu].debuggerinfo = regs; - kgdb_info[cpu].task = current; - /* - * Make sure the above info reaches the primary CPU before - * our cpu_in_kgdb[] flag setting does: - */ - smp_wmb(); - atomic_set(&cpu_in_kgdb[cpu], 1); - - /* Disable any cpu specific hw breakpoints */ - kgdb_disable_hw_debug(regs); - - /* Wait till primary CPU is done with debugging */ - while (atomic_read(&passive_cpu_wait[cpu])) - cpu_relax(); - - kgdb_info[cpu].debuggerinfo = NULL; - kgdb_info[cpu].task = NULL; - - /* fix up hardware debug registers on local cpu */ - if (arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - - /* Signal the primary CPU that we are done: */ - atomic_set(&cpu_in_kgdb[cpu], 0); - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); - local_irq_restore(flags); -} -#endif - -/* * Some architectures need cache flushes when we set/clear a * breakpoint: */ @@ -1400,34 +1359,13 @@ static int kgdb_reenter_check(struct kgdb_state *ks) return 1; } -/* - * kgdb_handle_exception() - main entry point from a kernel exception - * - * Locking hierarchy: - * interface locks, if any (begin_session) - * kgdb lock (kgdb_active) - */ -int -kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) +static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) { - struct kgdb_state kgdb_var; - struct kgdb_state *ks = &kgdb_var; unsigned long flags; int sstep_tries = 100; int error = 0; int i, cpu; - - ks->cpu = raw_smp_processor_id(); - ks->ex_vector = evector; - ks->signo = signo; - ks->ex_vector = evector; - ks->err_code = ecode; - ks->kgdb_usethreadid = 0; - ks->linux_regs = regs; - - if (kgdb_reenter_check(ks)) - return 0; /* Ouch, double exception ! */ - + int trace_on = 0; acquirelock: /* * Interrupts will be restored by the 'trap return' code, except when @@ -1435,13 +1373,43 @@ acquirelock: */ local_irq_save(flags); - cpu = raw_smp_processor_id(); + cpu = ks->cpu; + kgdb_info[cpu].debuggerinfo = regs; + kgdb_info[cpu].task = current; + /* + * Make sure the above info reaches the primary CPU before + * our cpu_in_kgdb[] flag setting does: + */ + atomic_inc(&cpu_in_kgdb[cpu]); /* - * Acquire the kgdb_active lock: + * CPU will loop if it is a slave or request to become a kgdb + * master cpu and acquire the kgdb_active lock: */ - while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) + while (1) { + if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { + if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) + break; + } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { + if (!atomic_read(&passive_cpu_wait[cpu])) + goto return_normal; + } else { +return_normal: + /* Return to normal operation by executing any + * hw breakpoint fixup. + */ + if (arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + if (trace_on) + tracing_on(); + atomic_dec(&cpu_in_kgdb[cpu]); + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + local_irq_restore(flags); + return 0; + } cpu_relax(); + } /* * For single stepping, try to only enter on the processor @@ -1475,9 +1443,6 @@ acquirelock: if (kgdb_io_ops->pre_exception) kgdb_io_ops->pre_exception(); - kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs; - kgdb_info[ks->cpu].task = current; - kgdb_disable_hw_debug(ks->linux_regs); /* @@ -1486,15 +1451,9 @@ acquirelock: */ if (!kgdb_single_step) { for (i = 0; i < NR_CPUS; i++) - atomic_set(&passive_cpu_wait[i], 1); + atomic_inc(&passive_cpu_wait[i]); } - /* - * spin_lock code is good enough as a barrier so we don't - * need one here: - */ - atomic_set(&cpu_in_kgdb[ks->cpu], 1); - #ifdef CONFIG_SMP /* Signal the other CPUs to enter kgdb_wait() */ if ((!kgdb_single_step) && kgdb_do_roundup) @@ -1518,6 +1477,9 @@ acquirelock: kgdb_single_step = 0; kgdb_contthread = current; exception_level = 0; + trace_on = tracing_is_on(); + if (trace_on) + tracing_off(); /* Talk to debugger with gdbserial protocol */ error = gdb_serial_stub(ks); @@ -1526,13 +1488,11 @@ acquirelock: if (kgdb_io_ops->post_exception) kgdb_io_ops->post_exception(); - kgdb_info[ks->cpu].debuggerinfo = NULL; - kgdb_info[ks->cpu].task = NULL; - atomic_set(&cpu_in_kgdb[ks->cpu], 0); + atomic_dec(&cpu_in_kgdb[ks->cpu]); if (!kgdb_single_step) { for (i = NR_CPUS-1; i >= 0; i--) - atomic_set(&passive_cpu_wait[i], 0); + atomic_dec(&passive_cpu_wait[i]); /* * Wait till all the CPUs have quit * from the debugger. @@ -1551,6 +1511,8 @@ kgdb_restore: else kgdb_sstep_pid = 0; } + if (trace_on) + tracing_on(); /* Free kgdb_active */ atomic_set(&kgdb_active, -1); touch_softlockup_watchdog_sync(); @@ -1560,13 +1522,52 @@ kgdb_restore: return error; } +/* + * kgdb_handle_exception() - main entry point from a kernel exception + * + * Locking hierarchy: + * interface locks, if any (begin_session) + * kgdb lock (kgdb_active) + */ +int +kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) +{ + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + int ret; + + ks->cpu = raw_smp_processor_id(); + ks->ex_vector = evector; + ks->signo = signo; + ks->ex_vector = evector; + ks->err_code = ecode; + ks->kgdb_usethreadid = 0; + ks->linux_regs = regs; + + if (kgdb_reenter_check(ks)) + return 0; /* Ouch, double exception ! */ + kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; + ret = kgdb_cpu_enter(ks, regs); + kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER; + return ret; +} + int kgdb_nmicallback(int cpu, void *regs) { #ifdef CONFIG_SMP + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + + memset(ks, 0, sizeof(struct kgdb_state)); + ks->cpu = cpu; + ks->linux_regs = regs; + if (!atomic_read(&cpu_in_kgdb[cpu]) && - atomic_read(&kgdb_active) != cpu && - atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { - kgdb_wait((struct pt_regs *)regs); + atomic_read(&kgdb_active) != -1 && + atomic_read(&kgdb_active) != cpu) { + kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; + kgdb_cpu_enter(ks, regs); + kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE; return 0; } #endif @@ -1742,11 +1743,11 @@ EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); */ void kgdb_breakpoint(void) { - atomic_set(&kgdb_setting_breakpoint, 1); + atomic_inc(&kgdb_setting_breakpoint); wmb(); /* Sync point before breakpoint */ arch_kgdb_breakpoint(); wmb(); /* Sync point after breakpoint */ - atomic_set(&kgdb_setting_breakpoint, 0); + atomic_dec(&kgdb_setting_breakpoint); } EXPORT_SYMBOL_GPL(kgdb_breakpoint); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index fa034d2..0ed46f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -259,7 +259,8 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, struct kprobe_insn_page *kip; list_for_each_entry(kip, &c->pages, list) { - long idx = ((long)slot - (long)kip->insns) / c->insn_size; + long idx = ((long)slot - (long)kip->insns) / + (c->insn_size * sizeof(kprobe_opcode_t)); if (idx >= 0 && idx < slots_per_page(c)) { WARN_ON(kip->slot_used[idx] != SLOT_USED); if (dirty) { diff --git a/kernel/kthread.c b/kernel/kthread.c index 82ed0ea..83911c78 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -219,7 +219,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, cpu_all_mask); - set_mems_allowed(node_possible_map); + set_mems_allowed(node_states[N_HIGH_MEMORY]); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ca07c5c..877fb30 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -56,7 +56,6 @@ #include <linux/module.h> #include <linux/sched.h> #include <linux/list.h> -#include <linux/slab.h> #include <linux/stacktrace.h> static DEFINE_SPINLOCK(latency_lock); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 681bc2e..2594e1c 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -43,6 +43,7 @@ #include <linux/ftrace.h> #include <linux/stringify.h> #include <linux/bitops.h> +#include <linux/gfp.h> #include <asm/sections.h> @@ -582,9 +583,6 @@ static int static_obj(void *obj) unsigned long start = (unsigned long) &_stext, end = (unsigned long) &_end, addr = (unsigned long) obj; -#ifdef CONFIG_SMP - int i; -#endif /* * static variable? @@ -595,24 +593,16 @@ static int static_obj(void *obj) if (arch_is_kernel_data(addr)) return 1; -#ifdef CONFIG_SMP /* - * percpu var? + * in-kernel percpu var? */ - for_each_possible_cpu(i) { - start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); - end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM - + per_cpu_offset(i); - - if ((addr >= start) && (addr < end)) - return 1; - } -#endif + if (is_kernel_percpu_address(addr)) + return 1; /* - * module var? + * module static or percpu var? */ - return is_module_address(addr); + return is_module_address(addr) || is_module_percpu_address(addr); } /* @@ -3211,8 +3201,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; - trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); - if (unlikely(current->lockdep_recursion)) return; @@ -3220,6 +3208,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, check_flags(flags); current->lockdep_recursion = 1; + trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip, 0); current->lockdep_recursion = 0; @@ -3232,14 +3221,13 @@ void lock_release(struct lockdep_map *lock, int nested, { unsigned long flags; - trace_lock_release(lock, nested, ip); - if (unlikely(current->lockdep_recursion)) return; raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; + trace_lock_release(lock, nested, ip); __lock_release(lock, nested, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); @@ -3413,8 +3401,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - trace_lock_contended(lock, ip); - if (unlikely(!lock_stat)) return; @@ -3424,6 +3410,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; + trace_lock_contended(lock, ip); __lock_contended(lock, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); diff --git a/kernel/module.c b/kernel/module.c index c968d36..1016b75 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -370,27 +370,33 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -static void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) +static inline void __percpu *mod_percpu(struct module *mod) { - void *ptr; + return mod->percpu; +} +static int percpu_modalloc(struct module *mod, + unsigned long size, unsigned long align) +{ if (align > PAGE_SIZE) { printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - name, align, PAGE_SIZE); + mod->name, align, PAGE_SIZE); align = PAGE_SIZE; } - ptr = __alloc_reserved_percpu(size, align); - if (!ptr) + mod->percpu = __alloc_reserved_percpu(size, align); + if (!mod->percpu) { printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", size); - return ptr; + return -ENOMEM; + } + mod->percpu_size = size; + return 0; } -static void percpu_modfree(void *freeme) +static void percpu_modfree(struct module *mod) { - free_percpu(freeme); + free_percpu(mod->percpu); } static unsigned int find_pcpusec(Elf_Ehdr *hdr, @@ -400,24 +406,62 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr, return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); } -static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +static void percpu_modcopy(struct module *mod, + const void *from, unsigned long size) { int cpu; for_each_possible_cpu(cpu) - memcpy(pcpudest + per_cpu_offset(cpu), from, size); + memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); +} + +/** + * is_module_percpu_address - test whether address is from module static percpu + * @addr: address to test + * + * Test whether @addr belongs to module static percpu area. + * + * RETURNS: + * %true if @addr is from module static percpu area + */ +bool is_module_percpu_address(unsigned long addr) +{ + struct module *mod; + unsigned int cpu; + + preempt_disable(); + + list_for_each_entry_rcu(mod, &modules, list) { + if (!mod->percpu_size) + continue; + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr(mod->percpu, cpu); + + if ((void *)addr >= start && + (void *)addr < start + mod->percpu_size) { + preempt_enable(); + return true; + } + } + } + + preempt_enable(); + return false; } #else /* ... !CONFIG_SMP */ -static inline void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) +static inline void __percpu *mod_percpu(struct module *mod) { return NULL; } -static inline void percpu_modfree(void *pcpuptr) +static inline int percpu_modalloc(struct module *mod, + unsigned long size, unsigned long align) +{ + return -ENOMEM; +} +static inline void percpu_modfree(struct module *mod) { - BUG(); } static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, @@ -425,12 +469,16 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, { return 0; } -static inline void percpu_modcopy(void *pcpudst, const void *src, - unsigned long size) +static inline void percpu_modcopy(struct module *mod, + const void *from, unsigned long size) { /* pcpusec should be 0, and size of that section should be 0. */ BUG_ON(size != 0); } +bool is_module_percpu_address(unsigned long addr) +{ + return false; +} #endif /* CONFIG_SMP */ @@ -473,11 +521,13 @@ static void module_unload_init(struct module *mod) int cpu; INIT_LIST_HEAD(&mod->modules_which_use_me); - for_each_possible_cpu(cpu) - per_cpu_ptr(mod->refptr, cpu)->count = 0; + for_each_possible_cpu(cpu) { + per_cpu_ptr(mod->refptr, cpu)->incs = 0; + per_cpu_ptr(mod->refptr, cpu)->decs = 0; + } /* Hold reference count during initialization. */ - __this_cpu_write(mod->refptr->count, 1); + __this_cpu_write(mod->refptr->incs, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; } @@ -616,12 +666,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced) unsigned int module_refcount(struct module *mod) { - unsigned int total = 0; + unsigned int incs = 0, decs = 0; int cpu; for_each_possible_cpu(cpu) - total += per_cpu_ptr(mod->refptr, cpu)->count; - return total; + decs += per_cpu_ptr(mod->refptr, cpu)->decs; + /* + * ensure the incs are added up after the decs. + * module_put ensures incs are visible before decs with smp_wmb. + * + * This 2-count scheme avoids the situation where the refcount + * for CPU0 is read, then CPU0 increments the module refcount, + * then CPU1 drops that refcount, then the refcount for CPU1 is + * read. We would record a decrement but not its corresponding + * increment so we would see a low count (disaster). + * + * Rare situation? But module_refcount can be preempted, and we + * might be tallying up 4096+ CPUs. So it is not impossible. + */ + smp_rmb(); + for_each_possible_cpu(cpu) + incs += per_cpu_ptr(mod->refptr, cpu)->incs; + return incs - decs; } EXPORT_SYMBOL(module_refcount); @@ -798,10 +864,11 @@ void module_put(struct module *module) { if (module) { preempt_disable(); - __this_cpu_dec(module->refptr->count); + smp_wmb(); /* see comment in module_refcount */ + __this_cpu_inc(module->refptr->decs); trace_module_put(module, _RET_IP_, - __this_cpu_read(module->refptr->count)); + __this_cpu_read(module->refptr->decs)); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); @@ -1400,8 +1467,7 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); - if (mod->percpu) - percpu_modfree(mod->percpu); + percpu_modfree(mod); #if defined(CONFIG_MODULE_UNLOAD) if (mod->refptr) free_percpu(mod->refptr); @@ -1520,7 +1586,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, default: /* Divert to percpu allocation if a percpu var. */ if (sym[i].st_shndx == pcpuindex) - secbase = (unsigned long)mod->percpu; + secbase = (unsigned long)mod_percpu(mod); else secbase = sechdrs[sym[i].st_shndx].sh_addr; sym[i].st_value += secbase; @@ -1954,7 +2020,7 @@ static noinline struct module *load_module(void __user *umod, unsigned int modindex, versindex, infoindex, pcpuindex; struct module *mod; long err = 0; - void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ + void *ptr = NULL; /* Stops spurious gcc warning */ unsigned long symoffs, stroffs, *strmap; mm_segment_t old_fs; @@ -2094,15 +2160,11 @@ static noinline struct module *load_module(void __user *umod, if (pcpuindex) { /* We have a special allocation for this section. */ - percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, - sechdrs[pcpuindex].sh_addralign, - mod->name); - if (!percpu) { - err = -ENOMEM; + err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, + sechdrs[pcpuindex].sh_addralign); + if (err) goto free_mod; - } sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - mod->percpu = percpu; } /* Determine total sizes, and put offsets in sh_entsize. For now @@ -2317,7 +2379,7 @@ static noinline struct module *load_module(void __user *umod, sort_extable(mod->extable, mod->extable + mod->num_exentries); /* Finally, copy percpu area over. */ - percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, + percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr, sechdrs[pcpuindex].sh_size); add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, @@ -2409,8 +2471,7 @@ static noinline struct module *load_module(void __user *umod, module_free(mod, mod->module_core); /* mod will be freed with core. Don't access it beyond this line! */ free_percpu: - if (percpu) - percpu_modfree(percpu); + percpu_modfree(mod); free_mod: kfree(args); kfree(strmap); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 2ab6723..f74e6c0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -13,6 +13,7 @@ * Pavel Emelianov <xemul@openvz.org> */ +#include <linux/slab.h> #include <linux/module.h> #include <linux/nsproxy.h> #include <linux/init_task.h> diff --git a/kernel/padata.c b/kernel/padata.c index 93caf65..fd03513 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -25,6 +25,7 @@ #include <linux/padata.h> #include <linux/mutex.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/rcupdate.h> #define MAX_SEQ_NR INT_MAX - NR_CPUS diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 4393b9e..3d1552d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -15,6 +15,7 @@ #include <linux/smp.h> #include <linux/file.h> #include <linux/poll.h> +#include <linux/slab.h> #include <linux/sysfs.h> #include <linux/dcache.h> #include <linux/percpu.h> @@ -81,10 +82,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } -void __weak hw_perf_event_setup(int cpu) { barrier(); } -void __weak hw_perf_event_setup_online(int cpu) { barrier(); } -void __weak hw_perf_event_setup_offline(int cpu) { barrier(); } - int __weak hw_perf_group_sched_in(struct perf_event *group_leader, struct perf_cpu_context *cpuctx, @@ -97,25 +94,15 @@ void __weak perf_event_print_debug(void) { } static DEFINE_PER_CPU(int, perf_disable_count); -void __perf_disable(void) -{ - __get_cpu_var(perf_disable_count)++; -} - -bool __perf_enable(void) -{ - return !--__get_cpu_var(perf_disable_count); -} - void perf_disable(void) { - __perf_disable(); - hw_perf_disable(); + if (!__get_cpu_var(perf_disable_count)++) + hw_perf_disable(); } void perf_enable(void) { - if (__perf_enable()) + if (!--__get_cpu_var(perf_disable_count)) hw_perf_enable(); } @@ -1178,11 +1165,9 @@ void perf_event_task_sched_out(struct task_struct *task, struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent; - struct pt_regs *regs; int do_switch = 1; - regs = task_pt_regs(task); - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); if (likely(!ctx || !cpuctx->task_ctx)) return; @@ -1538,12 +1523,15 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); + perf_disable(); event->pmu->unthrottle(event); + perf_enable(); } if (!event->attr.freq || !event->attr.sample_freq) continue; + perf_disable(); event->pmu->read(event); now = atomic64_read(&event->count); delta = now - hwc->freq_count_stamp; @@ -1551,6 +1539,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) if (delta > 0) perf_adjust_period(event, TICK_NSEC, delta); + perf_enable(); } raw_spin_unlock(&ctx->lock); } @@ -1560,9 +1549,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ static void rotate_ctx(struct perf_event_context *ctx) { - if (!ctx->nr_events) - return; - raw_spin_lock(&ctx->lock); /* Rotate the first entry last of non-pinned groups */ @@ -1575,19 +1561,28 @@ void perf_event_task_tick(struct task_struct *curr) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; + int rotate = 0; if (!atomic_read(&nr_events)) return; cpuctx = &__get_cpu_var(perf_cpu_context); - ctx = curr->perf_event_ctxp; + if (cpuctx->ctx.nr_events && + cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) + rotate = 1; - perf_disable(); + ctx = curr->perf_event_ctxp; + if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) + rotate = 1; perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) perf_ctx_adjust_freq(ctx); + if (!rotate) + return; + + perf_disable(); cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) task_ctx_sched_out(ctx, EVENT_FLEXIBLE); @@ -1599,7 +1594,6 @@ void perf_event_task_tick(struct task_struct *curr) cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) task_ctx_sched_in(curr, EVENT_FLEXIBLE); - perf_enable(); } @@ -2791,6 +2785,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } +__weak +void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) +{ +} + + /* * Output */ @@ -3376,15 +3376,23 @@ static void perf_event_task_output(struct perf_event *event, struct perf_task_event *task_event) { struct perf_output_handle handle; - int size; struct task_struct *task = task_event->task; - int ret; + unsigned long flags; + int size, ret; + + /* + * If this CPU attempts to acquire an rq lock held by a CPU spinning + * in perf_output_lock() from interrupt context, it's game over. + */ + local_irq_save(flags); size = task_event->event_id.header.size; ret = perf_output_begin(&handle, event, size, 0, 0); - if (ret) + if (ret) { + local_irq_restore(flags); return; + } task_event->event_id.pid = perf_event_pid(event, task); task_event->event_id.ppid = perf_event_pid(event, current); @@ -3395,6 +3403,7 @@ static void perf_event_task_output(struct perf_event *event, perf_output_put(&handle, task_event->event_id); perf_output_end(&handle); + local_irq_restore(flags); } static int perf_event_task_match(struct perf_event *event) @@ -4318,9 +4327,8 @@ static const struct pmu perf_ops_task_clock = { #ifdef CONFIG_EVENT_TRACING void perf_tp_event(int event_id, u64 addr, u64 count, void *record, - int entry_size) + int entry_size, struct pt_regs *regs) { - struct pt_regs *regs = get_irq_regs(); struct perf_sample_data data; struct perf_raw_record raw = { .size = entry_size, @@ -4330,12 +4338,9 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record, perf_sample_data_init(&data, addr); data.raw = &raw; - if (!regs) - regs = task_pt_regs(current); - /* Trace events already protected against recursion */ do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, - &data, regs); + &data, regs); } EXPORT_SYMBOL_GPL(perf_tp_event); @@ -4351,7 +4356,7 @@ static int perf_tp_event_match(struct perf_event *event, static void tp_perf_event_destroy(struct perf_event *event) { - ftrace_profile_disable(event->attr.config); + perf_trace_disable(event->attr.config); } static const struct pmu *tp_perf_event_init(struct perf_event *event) @@ -4365,7 +4370,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (ftrace_profile_enable(event->attr.config)) + if (perf_trace_enable(event->attr.config)) return NULL; event->destroy = tp_perf_event_destroy; @@ -4892,7 +4897,7 @@ err_fput_free_put_context: err_free_put_context: if (err < 0) - kfree(event); + free_event(event); err_put_context: if (err < 0) @@ -5372,18 +5377,26 @@ int perf_event_init_task(struct task_struct *child) return ret; } +static void __init perf_event_init_all_cpus(void) +{ + int cpu; + struct perf_cpu_context *cpuctx; + + for_each_possible_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx, NULL); + } +} + static void __cpuinit perf_event_init_cpu(int cpu) { struct perf_cpu_context *cpuctx; cpuctx = &per_cpu(perf_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx, NULL); spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; spin_unlock(&perf_resource_lock); - - hw_perf_event_setup(cpu); } #ifdef CONFIG_HOTPLUG_CPU @@ -5423,20 +5436,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_event_init_cpu(cpu); break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - hw_perf_event_setup_online(cpu); - break; - case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: perf_event_exit_cpu(cpu); break; - case CPU_DEAD: - hw_perf_event_setup_offline(cpu); - break; - default: break; } @@ -5454,6 +5458,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = { void __init perf_event_init(void) { + perf_event_init_all_cpus(); perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 79aac93..a5aff94 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -13,6 +13,7 @@ #include <linux/syscalls.h> #include <linux/err.h> #include <linux/acct.h> +#include <linux/slab.h> #define BITS_PER_PAGE (PAGE_SIZE*8) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 564b3b0..799f360 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -997,9 +997,9 @@ static void check_thread_timers(struct task_struct *tsk, } } -static void stop_process_timers(struct task_struct *tsk) +static void stop_process_timers(struct signal_struct *sig) { - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &sig->cputimer; unsigned long flags; if (!cputimer->running) @@ -1008,6 +1008,10 @@ static void stop_process_timers(struct task_struct *tsk) spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 0; spin_unlock_irqrestore(&cputimer->lock, flags); + + sig->cputime_expires.prof_exp = cputime_zero; + sig->cputime_expires.virt_exp = cputime_zero; + sig->cputime_expires.sched_exp = 0; } static u32 onecputick; @@ -1069,7 +1073,7 @@ static void check_process_timers(struct task_struct *tsk, list_empty(&timers[CPUCLOCK_VIRT]) && cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && list_empty(&timers[CPUCLOCK_SCHED])) { - stop_process_timers(tsk); + stop_process_timers(sig); return; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index da5288e..aa9e916 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -22,6 +22,7 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> +#include <linux/gfp.h> #include <scsi/scsi_scan.h> #include <asm/suspend.h> diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c index 39ac698..fdcad9e 100644 --- a/kernel/power/hibernate_nvs.c +++ b/kernel/power/hibernate_nvs.c @@ -10,6 +10,7 @@ #include <linux/kernel.h> #include <linux/list.h> #include <linux/mm.h> +#include <linux/slab.h> #include <linux/suspend.h> /* diff --git a/kernel/power/process.c b/kernel/power/process.c index 5ade1bd..71ae290 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -88,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only) printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " "(%d tasks refusing to freeze):\n", elapsed_csecs / 100, elapsed_csecs % 100, todo); - show_state(); read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); if (freezing(p) && !freezer_should_skip(p)) - printk(KERN_ERR " %s\n", p->comm); + sched_show_task(p); cancel_freezing(p); task_unlock(p); } while_each_thread(g, p); @@ -145,7 +144,7 @@ static void thaw_tasks(bool nosig_only) if (nosig_only && should_send_signal(p)) continue; - if (cgroup_frozen(p)) + if (cgroup_freezing_or_frozen(p)) continue; thaw_process(p); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 830cade..be861c2 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -26,6 +26,7 @@ #include <linux/console.h> #include <linux/highmem.h> #include <linux/list.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 44cce10..56e7dbb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -15,6 +15,7 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/syscalls.h> +#include <linux/gfp.h> #include "power.h" diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 1d57573..66824d7 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -23,6 +23,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pm.h> +#include <linux/slab.h> #include "power.h" diff --git a/kernel/power/user.c b/kernel/power/user.c index 4d22896..a8c9621 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -420,7 +420,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, * User space encodes device types as two-byte values, * so we need to recode them */ - swdev = old_decode_dev(swap_area.dev); + swdev = new_decode_dev(swap_area.dev); if (swdev) { offset = swap_area.offset; data->swap = swap_type_of(swdev, offset, NULL); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f1125c1..49d808e 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,7 @@ #include <linux/mutex.h> #include <linux/module.h> #include <linux/kernel_stat.h> +#include <linux/hardirq.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -66,6 +67,35 @@ EXPORT_SYMBOL_GPL(rcu_sched_lock_map); int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int debug_lockdep_rcu_enabled(void) +{ + return rcu_scheduler_active && debug_locks && + current->lockdep_recursion == 0; +} +EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); + +/** + * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? + * + * Check for bottom half being disabled, which covers both the + * CONFIG_PROVE_RCU and not cases. Note that if someone uses + * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) + * will show the situation. + * + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. + */ +int rcu_read_lock_bh_held(void) +{ + if (!debug_lockdep_rcu_enabled()) + return 1; + return in_softirq(); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); + +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + /* * This function is invoked towards the end of the scheduler's initialization * process. Before this is called, the idle task might contain @@ -92,3 +122,14 @@ void wakeme_after_rcu(struct rcu_head *head) rcu = container_of(head, struct rcu_synchronize, head); complete(&rcu->completion); } + +#ifdef CONFIG_PROVE_RCU +/* + * wrapper function to avoid #include problems. + */ +int rcu_my_thread_group_empty(void) +{ + return thread_group_empty(current); +} +EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); +#endif /* #ifdef CONFIG_PROVE_RCU */ diff --git a/kernel/res_counter.c b/kernel/res_counter.c index bcdabf3..c7eaa37 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -10,7 +10,6 @@ #include <linux/types.h> #include <linux/parser.h> #include <linux/fs.h> -#include <linux/slab.h> #include <linux/res_counter.h> #include <linux/uaccess.h> #include <linux/mm.h> diff --git a/kernel/resource.c b/kernel/resource.c index 2d5be5d..9c358e2 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -219,19 +219,34 @@ void release_child_resources(struct resource *r) } /** - * request_resource - request and reserve an I/O or memory resource + * request_resource_conflict - request and reserve an I/O or memory resource * @root: root resource descriptor * @new: resource descriptor desired by caller * - * Returns 0 for success, negative error code on error. + * Returns 0 for success, conflict resource on error. */ -int request_resource(struct resource *root, struct resource *new) +struct resource *request_resource_conflict(struct resource *root, struct resource *new) { struct resource *conflict; write_lock(&resource_lock); conflict = __request_resource(root, new); write_unlock(&resource_lock); + return conflict; +} + +/** + * request_resource - request and reserve an I/O or memory resource + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns 0 for success, negative error code on error. + */ +int request_resource(struct resource *root, struct resource *new) +{ + struct resource *conflict; + + conflict = request_resource_conflict(root, new); return conflict ? -EBUSY : 0; } @@ -474,25 +489,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou } /** - * insert_resource - Inserts a resource in the resource tree + * insert_resource_conflict - Inserts resource in the resource tree * @parent: parent of the new resource * @new: new resource to insert * - * Returns 0 on success, -EBUSY if the resource can't be inserted. + * Returns 0 on success, conflict resource if the resource can't be inserted. * - * This function is equivalent to request_resource when no conflict + * This function is equivalent to request_resource_conflict when no conflict * happens. If a conflict happens, and the conflicting resources * entirely fit within the range of the new resource, then the new * resource is inserted and the conflicting resources become children of * the new resource. */ -int insert_resource(struct resource *parent, struct resource *new) +struct resource *insert_resource_conflict(struct resource *parent, struct resource *new) { struct resource *conflict; write_lock(&resource_lock); conflict = __insert_resource(parent, new); write_unlock(&resource_lock); + return conflict; +} + +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + */ +int insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *conflict; + + conflict = insert_resource_conflict(parent, new); return conflict ? -EBUSY : 0; } diff --git a/kernel/sched.c b/kernel/sched.c index 9ab3cd7..3c2a54f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,7 @@ #include <linux/debugfs.h> #include <linux/ctype.h> #include <linux/ftrace.h> +#include <linux/slab.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -322,6 +323,15 @@ static inline struct task_group *task_group(struct task_struct *p) /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { + /* + * Strictly speaking this rcu_read_lock() is not needed since the + * task_group is tied to the cgroup, which in turn can never go away + * as long as there are tasks attached to it. + * + * However since task_group() uses task_subsys_state() which is an + * rcu_dereference() user, this quiets CONFIG_PROVE_RCU. + */ + rcu_read_lock(); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; @@ -331,6 +341,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) p->rt.rt_rq = task_group(p)->rt_rq[cpu]; p->rt.parent = task_group(p)->rt_se[cpu]; #endif + rcu_read_unlock(); } #else @@ -2650,7 +2661,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; - int cpu = get_cpu(); + int cpu __maybe_unused = get_cpu(); #ifdef CONFIG_SMP /* @@ -3779,7 +3790,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the mutex owner just released it and exited. */ if (probe_kernel_address(&owner->cpu, cpu)) - goto out; + return 0; #else cpu = owner->cpu; #endif @@ -3789,14 +3800,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the cpu field may no longer be valid. */ if (cpu >= nr_cpumask_bits) - goto out; + return 0; /* * We need to validate that we can do a * get_cpu() and that we have the percpu area. */ if (!cpu_online(cpu)) - goto out; + return 0; rq = cpu_rq(cpu); @@ -3815,7 +3826,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) cpu_relax(); } -out: + return 1; } #endif @@ -4902,7 +4913,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, int ret; cpumask_var_t mask; - if (len < cpumask_size()) + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) return -EINVAL; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) @@ -4910,10 +4923,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + size_t retlen = min_t(size_t, len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, mask, retlen)) ret = -EFAULT; else - ret = cpumask_size(); + ret = retlen; } free_cpumask_var(mask); @@ -5383,7 +5398,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) get_task_struct(mt); task_rq_unlock(rq, &flags); - wake_up_process(rq->migration_thread); + wake_up_process(mt); put_task_struct(mt); wait_for_completion(&req.done); tlb_migrate_finish(p->mm); diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index fccf9fb..e6871cb 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -27,6 +27,7 @@ * of the License. */ +#include <linux/gfp.h> #include "sched_cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 67f95aa..19be00b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { char path[64]; + rcu_read_lock(); cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); + rcu_read_unlock(); SEQ_printf(m, " %s", path); } #endif @@ -518,8 +520,4 @@ void proc_sched_set_task(struct task_struct *p) p->se.nr_wakeups_idle = 0; p->sched_info.bkl_count = 0; #endif - p->se.sum_exec_runtime = 0; - p->se.prev_sum_exec_runtime = 0; - p->nvcsw = 0; - p->nivcsw = 0; } diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 7494bbf..7d3f4fa 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, goto cancelled; /* the timer holds a reference whilst it is pending */ - ret = work->ops->get_ref(work); + ret = slow_work_get_ref(work); if (ret < 0) goto cant_get_ref; diff --git a/kernel/slow-work.h b/kernel/slow-work.h index 321f3c5..a29ebd1 100644 --- a/kernel/slow-work.h +++ b/kernel/slow-work.h @@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); */ static inline void slow_work_set_thread_pid(int id, pid_t pid) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG slow_work_pids[id] = pid; #endif } static inline void slow_work_mark_time(struct slow_work *work) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG work->mark = CURRENT_TIME; #endif } static inline void slow_work_begin_exec(int id, struct slow_work *work) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG slow_work_execs[id] = work; #endif } static inline void slow_work_end_exec(int id, struct slow_work *work) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG write_lock(&slow_work_execs_lock); slow_work_execs[id] = NULL; write_unlock(&slow_work_execs_lock); diff --git a/kernel/smp.c b/kernel/smp.c index 9867b6b..3fc6973 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/percpu.h> #include <linux/init.h> +#include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 0d4c789..4b493f6 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -155,11 +155,11 @@ void softlockup_tick(void) * Wake up the high-prio watchdog task twice per * threshold timespan. */ - if (now > touch_ts + softlockup_thresh/2) + if (time_after(now - softlockup_thresh/2, touch_ts)) wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); /* Warn about unreasonable delays: */ - if (now <= (touch_ts + softlockup_thresh)) + if (time_before_eq(now - softlockup_thresh, touch_ts)) return; per_cpu(softlockup_print_ts, this_cpu) = touch_ts; diff --git a/kernel/srcu.c b/kernel/srcu.c index bde4295..2980da3 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -30,7 +30,6 @@ #include <linux/preempt.h> #include <linux/rcupdate.h> #include <linux/sched.h> -#include <linux/slab.h> #include <linux/smp.h> #include <linux/srcu.h> diff --git a/kernel/sys.c b/kernel/sys.c index 8298878f..7cb426a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -36,6 +36,7 @@ #include <linux/personality.h> #include <linux/ptrace.h> #include <linux/fs_struct.h> +#include <linux/gfp.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -1117,7 +1118,7 @@ DECLARE_RWSEM(uts_sem); #ifdef COMPAT_UTS_MACHINE #define override_architecture(name) \ - (current->personality == PER_LINUX32 && \ + (personality(current->personality) == PER_LINUX32 && \ copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ sizeof(COMPAT_UTS_MACHINE))) #else diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 8cd50d8..5903057 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -13,6 +13,7 @@ #include <linux/file.h> #include <linux/ctype.h> #include <linux/netdevice.h> +#include <linux/slab.h> #ifdef CONFIG_SYSCTL_SYSCALL diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 899ca51..11281d5 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -22,6 +22,7 @@ #include <linux/delayacct.h> #include <linux/cpumask.h> #include <linux/percpu.h> +#include <linux/slab.h> #include <linux/cgroupstats.h> #include <linux/cgroup.h> #include <linux/fs.h> diff --git a/kernel/time.c b/kernel/time.c index 2358a36..50612fa 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -35,7 +35,6 @@ #include <linux/syscalls.h> #include <linux/security.h> #include <linux/fs.h> -#include <linux/slab.h> #include <linux/math64.h> #include <linux/ptrace.h> diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0a8a213..aada0e5 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -22,6 +22,29 @@ #include "tick-internal.h" +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) + +static int tick_increase_min_delta(struct clock_event_device *dev) +{ + /* Nothing to do if we already reached the limit */ + if (dev->min_delta_ns >= MIN_DELTA_LIMIT) + return -ETIME; + + if (dev->min_delta_ns < 5000) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + if (dev->min_delta_ns > MIN_DELTA_LIMIT) + dev->min_delta_ns = MIN_DELTA_LIMIT; + + printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); + return 0; +} + /** * tick_program_event internal worker function */ @@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, if (!ret || !force) return ret; + dev->retries++; /* - * We tried 2 times to program the device with the given - * min_delta_ns. If that's not working then we double it + * We tried 3 times to program the device with the given + * min_delta_ns. If that's not working then we increase it * and emit a warning. */ if (++i > 2) { /* Increase the min. delta and try again */ - if (!dev->min_delta_ns) - dev->min_delta_ns = 5000; - else - dev->min_delta_ns += dev->min_delta_ns >> 1; - - printk(KERN_WARNING - "CE: %s increasing min_delta_ns to %llu nsec\n", - dev->name ? dev->name : "?", - (unsigned long long) dev->min_delta_ns << 1); - + if (tick_increase_min_delta(dev)) { + /* + * Get out of the loop if min_delta_ns + * hit the limit already. That's + * better than staying here forever. + * + * We clear next_event so we have a + * chance that the box survives. + */ + printk(KERN_WARNING + "CE: Reprogramming failure. Giving up\n"); + dev->next_event.tv64 = KTIME_MAX; + return -ETIME; + } i = 0; } diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index 12f5c55..ac38fbb 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c @@ -19,6 +19,7 @@ #include <linux/timecompare.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/math64.h> /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1137f24..caf8d4d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -806,7 +806,8 @@ void update_wall_time(void) shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { offset = logarithmic_accumulation(offset, shift); - shift--; + if(offset < timekeeper.cycle_interval<<shift) + shift--; } /* correct the clock when NTP error is too big */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index bdfb8dd..1a4a7dd 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -228,6 +228,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, " event_handler: "); print_name_offset(m, dev->event_handler); SEQ_printf(m, "\n"); + SEQ_printf(m, " retries: %lu\n", dev->retries); } static void timer_list_show_tickdevices(struct seq_file *m) @@ -257,7 +258,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.5\n"); + SEQ_printf(m, "Timer List Version: v0.6\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); diff --git a/kernel/timer.c b/kernel/timer.c index 49773f3..9199f3c 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -39,6 +39,7 @@ #include <linux/kallsyms.h> #include <linux/perf_event.h> #include <linux/sched.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -936,6 +937,7 @@ int try_to_del_timer_sync(struct timer_list *timer) if (base->running_timer == timer) goto out; + timer_stats_timer_clear_start_info(timer); ret = 0; if (timer_pending(timer)) { detach_timer(timer, 1); diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d00c6fe..78edc64 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o ifeq ($(CONFIG_PERF_EVENTS),y) -obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o +obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 07f945a9..b3bc91a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -21,6 +21,7 @@ #include <linux/percpu.h> #include <linux/init.h> #include <linux/mutex.h> +#include <linux/slab.h> #include <linux/debugfs.h> #include <linux/smp_lock.h> #include <linux/time.h> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d9062f5..2404b59 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -24,6 +24,7 @@ #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/sysctl.h> +#include <linux/slab.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/hash.h> diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 9f4f565..a22582a 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -9,7 +9,6 @@ #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/module.h> -#include <linux/slab.h> #define CREATE_TRACE_POINTS #include <trace/events/power.h> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 05a9f83..41ca394 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/list.h> @@ -207,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ +#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -1201,18 +1210,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - return; + goto out; p = cpu_buffer->pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); free_buffer_page(bpage); } if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - return; + goto out; rb_reset_cpu(cpu_buffer); rb_check_pages(cpu_buffer); +out: spin_unlock_irq(&cpu_buffer->reader_lock); } @@ -1229,7 +1239,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(pages))) - return; + goto out; p = pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); @@ -1238,6 +1248,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_reset_cpu(cpu_buffer); rb_check_pages(cpu_buffer); +out: spin_unlock_irq(&cpu_buffer->reader_lock); } @@ -1547,7 +1558,7 @@ rb_update_event(struct ring_buffer_event *event, case 0: length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) event->array[0] = length; else event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); @@ -1722,11 +1733,11 @@ static unsigned rb_calculate_event_length(unsigned length) if (!length) length = 1; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ALIGNMENT); + length = ALIGN(length, RB_ARCH_ALIGNMENT); return length; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3ec2ee6..44f916a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -33,10 +33,10 @@ #include <linux/kdebug.h> #include <linux/string.h> #include <linux/rwsem.h> +#include <linux/slab.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/poll.h> -#include <linux/gfp.h> #include <linux/fs.h> #include "trace.h" diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 6fbfb8f..9d589d8 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -84,7 +84,7 @@ u64 notrace trace_clock_global(void) int this_cpu; u64 now; - raw_local_irq_save(flags); + local_irq_save(flags); this_cpu = raw_smp_processor_id(); now = cpu_clock(this_cpu); @@ -110,7 +110,7 @@ u64 notrace trace_clock_global(void) arch_spin_unlock(&trace_clock_struct.lock); out: - raw_local_irq_restore(flags); + local_irq_restore(flags); return now; } diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_perf.c index c1cc3ab..0565bb4 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_perf.c @@ -1,32 +1,41 @@ /* - * trace event based perf counter profiling + * trace event based perf event profiling/tracing * * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> - * + * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com> */ #include <linux/module.h> #include <linux/kprobes.h> #include "trace.h" +DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); +EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs); + +EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); static char *perf_trace_buf; static char *perf_trace_buf_nmi; -typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; +/* + * Force it to be aligned to unsigned long to avoid misaligned accesses + * suprises + */ +typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) + perf_trace_t; /* Count the events in use (per event id, not per instance) */ -static int total_profile_count; +static int total_ref_count; -static int ftrace_profile_enable_event(struct ftrace_event_call *event) +static int perf_trace_event_enable(struct ftrace_event_call *event) { char *buf; int ret = -ENOMEM; - if (event->profile_count++ > 0) + if (event->perf_refcount++ > 0) return 0; - if (!total_profile_count) { + if (!total_ref_count) { buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf; @@ -40,35 +49,35 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) rcu_assign_pointer(perf_trace_buf_nmi, buf); } - ret = event->profile_enable(event); + ret = event->perf_event_enable(event); if (!ret) { - total_profile_count++; + total_ref_count++; return 0; } fail_buf_nmi: - if (!total_profile_count) { + if (!total_ref_count) { free_percpu(perf_trace_buf_nmi); free_percpu(perf_trace_buf); perf_trace_buf_nmi = NULL; perf_trace_buf = NULL; } fail_buf: - event->profile_count--; + event->perf_refcount--; return ret; } -int ftrace_profile_enable(int event_id) +int perf_trace_enable(int event_id) { struct ftrace_event_call *event; int ret = -EINVAL; mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && event->profile_enable && + if (event->id == event_id && event->perf_event_enable && try_module_get(event->mod)) { - ret = ftrace_profile_enable_event(event); + ret = perf_trace_event_enable(event); break; } } @@ -77,16 +86,16 @@ int ftrace_profile_enable(int event_id) return ret; } -static void ftrace_profile_disable_event(struct ftrace_event_call *event) +static void perf_trace_event_disable(struct ftrace_event_call *event) { char *buf, *nmi_buf; - if (--event->profile_count > 0) + if (--event->perf_refcount > 0) return; - event->profile_disable(event); + event->perf_event_disable(event); - if (!--total_profile_count) { + if (!--total_ref_count) { buf = perf_trace_buf; rcu_assign_pointer(perf_trace_buf, NULL); @@ -104,14 +113,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event) } } -void ftrace_profile_disable(int event_id) +void perf_trace_disable(int event_id) { struct ftrace_event_call *event; mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) { - ftrace_profile_disable_event(event); + perf_trace_event_disable(event); module_put(event->mod); break; } @@ -119,13 +128,15 @@ void ftrace_profile_disable(int event_id) mutex_unlock(&event_mutex); } -__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, - int *rctxp, unsigned long *irq_flags) +__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, + int *rctxp, unsigned long *irq_flags) { struct trace_entry *entry; char *trace_buf, *raw_data; int pc, cpu; + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); + pc = preempt_count(); /* Protect the per cpu buffer, begin the rcu read side */ @@ -148,7 +159,7 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); entry = (struct trace_entry *)raw_data; tracing_generic_entry_update(entry, *irq_flags, pc); @@ -161,4 +172,4 @@ err_recursion: local_irq_restore(*irq_flags); return NULL; } -EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare); +EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3f972ad9..c697c70 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,6 +15,7 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> +#include <linux/slab.h> #include <linux/delay.h> #include <asm/setup.h> @@ -938,7 +939,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, trace_create_file("enable", 0644, call->dir, call, enable); - if (call->id && call->profile_enable) + if (call->id && call->perf_event_enable) trace_create_file("id", 0444, call->dir, call, id); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 4615f62..88c0b6d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -22,6 +22,7 @@ #include <linux/ctype.h> #include <linux/mutex.h> #include <linux/perf_event.h> +#include <linux/slab.h> #include "trace.h" #include "trace_output.h" diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e6989d9..9aed1a5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -9,6 +9,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/slab.h> #include <linux/fs.h> #include "trace.h" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 505c922..1251e36 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1214,7 +1214,7 @@ static int set_print_fmt(struct trace_probe *tp) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static __kprobes void kprobe_profile_func(struct kprobe *kp, +static __kprobes void kprobe_perf_func(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); @@ -1227,11 +1227,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp, __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; - entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); + entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); if (!entry) return; @@ -1240,11 +1240,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp, for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); } /* Kretprobe profile handler */ -static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, +static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); @@ -1257,11 +1257,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; - entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); + entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); if (!entry) return; @@ -1271,10 +1271,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags); + perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, + irq_flags, regs); } -static int probe_profile_enable(struct ftrace_event_call *call) +static int probe_perf_enable(struct ftrace_event_call *call) { struct trace_probe *tp = (struct trace_probe *)call->data; @@ -1286,7 +1287,7 @@ static int probe_profile_enable(struct ftrace_event_call *call) return enable_kprobe(&tp->rp.kp); } -static void probe_profile_disable(struct ftrace_event_call *call) +static void probe_perf_disable(struct ftrace_event_call *call) { struct trace_probe *tp = (struct trace_probe *)call->data; @@ -1311,7 +1312,7 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(kp, regs); #ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) - kprobe_profile_func(kp, regs); + kprobe_perf_func(kp, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1325,7 +1326,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) kretprobe_trace_func(ri, regs); #ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) - kretprobe_profile_func(ri, regs); + kretprobe_perf_func(ri, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1358,8 +1359,8 @@ static int register_probe_event(struct trace_probe *tp) call->unregfunc = probe_event_disable; #ifdef CONFIG_PERF_EVENTS - call->profile_enable = probe_profile_enable; - call->profile_disable = probe_profile_disable; + call->perf_event_enable = probe_perf_enable; + call->perf_event_disable = probe_perf_disable; #endif call->data = tp; ret = trace_add_event_call(call); diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 94103cd..d59cd68 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -23,6 +23,7 @@ #include <linux/debugfs.h> #include <linux/ftrace.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/fs.h> #include "trace_output.h" diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 0acd834..017fa37 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/mmiotrace.h> #include <linux/pci.h> +#include <linux/slab.h> #include <linux/time.h> #include <asm/atomic.h> diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 280fea4..81003b4 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -3,6 +3,7 @@ #include <linux/stringify.h> #include <linux/kthread.h> #include <linux/delay.h> +#include <linux/slab.h> static inline int trace_valid_entry(struct trace_entry *entry) { diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index a4bb239..96cffb2 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -10,6 +10,7 @@ #include <linux/list.h> +#include <linux/slab.h> #include <linux/rbtree.h> #include <linux/debugfs.h> #include "trace_stat.h" diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index cba47d7..4d6d711 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,5 +1,6 @@ #include <trace/syscall.h> #include <trace/events/syscalls.h> +#include <linux/slab.h> #include <linux/kernel.h> #include <linux/ftrace.h> #include <linux/perf_event.h> @@ -428,12 +429,12 @@ core_initcall(init_ftrace_syscalls); #ifdef CONFIG_PERF_EVENTS -static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); -static int sys_prof_refcount_enter; -static int sys_prof_refcount_exit; +static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); +static int sys_perf_refcount_enter; +static int sys_perf_refcount_exit; -static void prof_syscall_enter(struct pt_regs *regs, long id) +static void perf_syscall_enter(struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; @@ -443,7 +444,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) int size; syscall_nr = syscall_get_nr(current, regs); - if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) + if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -455,11 +456,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, - "profile buffer not large enough")) + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "perf buffer not large enough")) return; - rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size, + rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, sys_data->enter_event->id, &rctx, &flags); if (!rec) return; @@ -467,10 +468,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); + perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); } -int prof_sysenter_enable(struct ftrace_event_call *call) +int perf_sysenter_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -478,34 +479,34 @@ int prof_sysenter_enable(struct ftrace_event_call *call) num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - if (!sys_prof_refcount_enter) - ret = register_trace_sys_enter(prof_syscall_enter); + if (!sys_perf_refcount_enter) + ret = register_trace_sys_enter(perf_syscall_enter); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); } else { - set_bit(num, enabled_prof_enter_syscalls); - sys_prof_refcount_enter++; + set_bit(num, enabled_perf_enter_syscalls); + sys_perf_refcount_enter++; } mutex_unlock(&syscall_trace_lock); return ret; } -void prof_sysenter_disable(struct ftrace_event_call *call) +void perf_sysenter_disable(struct ftrace_event_call *call) { int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - sys_prof_refcount_enter--; - clear_bit(num, enabled_prof_enter_syscalls); - if (!sys_prof_refcount_enter) - unregister_trace_sys_enter(prof_syscall_enter); + sys_perf_refcount_enter--; + clear_bit(num, enabled_perf_enter_syscalls); + if (!sys_perf_refcount_enter) + unregister_trace_sys_enter(perf_syscall_enter); mutex_unlock(&syscall_trace_lock); } -static void prof_syscall_exit(struct pt_regs *regs, long ret) +static void perf_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; @@ -515,7 +516,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) int size; syscall_nr = syscall_get_nr(current, regs); - if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) + if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -530,11 +531,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) * Impossible, but be paranoid with the future * How to put this check outside runtime? */ - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, - "exit event has grown above profile buffer size")) + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "exit event has grown above perf buffer size")) return; - rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size, + rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, sys_data->exit_event->id, &rctx, &flags); if (!rec) return; @@ -542,10 +543,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); + perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); } -int prof_sysexit_enable(struct ftrace_event_call *call) +int perf_sysexit_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -553,30 +554,30 @@ int prof_sysexit_enable(struct ftrace_event_call *call) num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - if (!sys_prof_refcount_exit) - ret = register_trace_sys_exit(prof_syscall_exit); + if (!sys_perf_refcount_exit) + ret = register_trace_sys_exit(perf_syscall_exit); if (ret) { pr_info("event trace: Could not activate" "syscall exit trace point"); } else { - set_bit(num, enabled_prof_exit_syscalls); - sys_prof_refcount_exit++; + set_bit(num, enabled_perf_exit_syscalls); + sys_perf_refcount_exit++; } mutex_unlock(&syscall_trace_lock); return ret; } -void prof_sysexit_disable(struct ftrace_event_call *call) +void perf_sysexit_disable(struct ftrace_event_call *call) { int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - sys_prof_refcount_exit--; - clear_bit(num, enabled_prof_exit_syscalls); - if (!sys_prof_refcount_exit) - unregister_trace_sys_exit(prof_syscall_exit); + sys_perf_refcount_exit--; + clear_bit(num, enabled_perf_exit_syscalls); + if (!sys_perf_refcount_exit) + unregister_trace_sys_exit(perf_syscall_exit); mutex_unlock(&syscall_trace_lock); } diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 40cafb0..cc2d2fa 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -9,6 +9,7 @@ #include <trace/events/workqueue.h> #include <linux/list.h> #include <linux/percpu.h> +#include <linux/slab.h> #include <linux/kref.h> #include "trace_stat.h" #include "trace.h" diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dee4865..5bfb213 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -774,7 +774,7 @@ void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(keventd_wq, get_cpu()); + cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); __queue_work(cwq, &dwork->work); put_cpu(); } |