diff options
Diffstat (limited to 'kernel')
52 files changed, 975 insertions, 532 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 0b5ff08..353d3fe 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o +obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif @@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ +obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o @@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. # Info from config_data can be extracted from /proc/config* targets += config_data.gz -$(obj)/config_data.gz: .config FORCE +$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) quiet_cmd_ikconfiggz = IKCFG $@ diff --git a/kernel/audit.c b/kernel/audit.c index 77770a0..e495624 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb) if (err < 0) { BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd dissapeared\n"); + audit_log_lost("auditd disappeared\n"); audit_pid = 0; /* we might get lucky and get this in the next auditd */ audit_hold_skb(skb); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66a416b..b24d702 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); */ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); static const struct inode_operations cgroup_dir_inode_operations; @@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } +static int cgroup_delete(const struct dentry *d) +{ + return 1; +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -874,25 +880,29 @@ static void cgroup_clear_directory(struct dentry *dentry) struct list_head *node; BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); node = dentry->d_subdirs.next; while (node != &dentry->d_subdirs) { struct dentry *d = list_entry(node, struct dentry, d_u.d_child); + + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); list_del_init(node); if (d->d_inode) { /* This should never be called on a cgroup * directory with child cgroups */ BUG_ON(d->d_inode->i_mode & S_IFDIR); - d = dget_locked(d); - spin_unlock(&dcache_lock); + dget_dlock(d); + spin_unlock(&d->d_lock); + spin_unlock(&dentry->d_lock); d_delete(d); simple_unlink(dentry->d_inode, d); dput(d); - spin_lock(&dcache_lock); - } + spin_lock(&dentry->d_lock); + } else + spin_unlock(&d->d_lock); node = dentry->d_subdirs.next; } - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); } /* @@ -900,11 +910,16 @@ static void cgroup_clear_directory(struct dentry *dentry) */ static void cgroup_d_remove_dir(struct dentry *dentry) { + struct dentry *parent; + cgroup_clear_directory(dentry); - spin_lock(&dcache_lock); + parent = dentry->d_parent; + spin_lock(&parent->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); list_del_init(&dentry->d_u.d_child); - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); remove_dir(dentry); } @@ -1440,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data) static int cgroup_get_rootdir(struct super_block *sb) { + static const struct dentry_operations cgroup_dops = { + .d_iput = cgroup_diput, + .d_delete = cgroup_delete, + }; + struct inode *inode = cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); struct dentry *dentry; @@ -1457,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb) return -ENOMEM; } sb->s_root = dentry; + /* for everything else we want ->d_op set */ + sb->s_d_op = &cgroup_dops; return 0; } @@ -2180,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = { }; static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = simple_lookup, + .lookup = cgroup_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, }; +static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_add(dentry, NULL); + return NULL; +} + /* * Check if a file is a control file */ @@ -2199,10 +2229,6 @@ static inline struct cftype *__file_cft(struct file *file) static int cgroup_create_file(struct dentry *dentry, mode_t mode, struct super_block *sb) { - static const struct dentry_operations cgroup_dops = { - .d_iput = cgroup_diput, - }; - struct inode *inode; if (!dentry) @@ -2228,7 +2254,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode, inode->i_size = 0; inode->i_fop = &cgroup_file_operations; } - dentry->d_op = &cgroup_dops; d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ return 0; @@ -3638,9 +3663,7 @@ again: list_del(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); - spin_lock(&cgrp->dentry->d_lock); d = dget(cgrp->dentry); - spin_unlock(&d->d_lock); cgroup_d_remove_dir(d); dput(d); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index a6e7297..bd3e8e2 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2914,7 +2914,7 @@ static void __init kdb_cmd_init(void) } } -/* Intialize kdb_printf, breakpoint tables and kdb state */ +/* Initialize kdb_printf, breakpoint tables and kdb state */ void __init kdb_init(int lvl) { static int kdb_init_lvl = KDB_NOT_INITIALIZED; diff --git a/kernel/exit.c b/kernel/exit.c index 676149a..f9a45eb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -69,7 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); - __get_cpu_var(process_counts)--; + __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_group); } @@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code) exit_fs(tsk); check_stack_usage(); exit_thread(); + + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + * + * because of cgroup mode, must be called before cgroup_exit() + */ + perf_event_exit_task(tsk); + cgroup_exit(tsk, 1); if (group_dead) @@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code) * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_event_exit_task(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/fork.c b/kernel/fork.c index 7d164e2..25e4291 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> #include <linux/oom.h> +#include <linux/khugepaged.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -169,15 +170,14 @@ EXPORT_SYMBOL(free_task); static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); + sched_autogroup_exit(sig); kmem_cache_free(signal_cachep, sig); } static inline void put_signal_struct(struct signal_struct *sig) { - if (atomic_dec_and_test(&sig->sigcnt)) { - sched_autogroup_exit(sig); + if (atomic_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); - } } void __put_task_struct(struct task_struct *tsk) @@ -331,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) retval = ksm_fork(mm, oldmm); if (retval) goto out; + retval = khugepaged_fork(mm, oldmm); + if (retval) + goto out; prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { @@ -530,6 +533,9 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -544,6 +550,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { @@ -670,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->token_priority = 0; mm->last_interval = 0; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + mm->pmd_huge_pte = NULL; +#endif + if (!mm_init(mm, tsk)) goto fail_nomem; @@ -911,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); @@ -1286,7 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); - __get_cpu_var(process_counts)++; + __this_cpu_inc(process_counts); } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; @@ -1318,7 +1330,7 @@ bad_fork_cleanup_mm: } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) - put_signal_struct(p->signal); + free_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: @@ -1411,23 +1423,6 @@ long do_fork(unsigned long clone_flags, } /* - * We hope to recycle these flags after 2.6.26 - */ - if (unlikely(clone_flags & CLONE_STOPPED)) { - static int __read_mostly count = 100; - - if (count > 0 && printk_ratelimit()) { - char comm[TASK_COMM_LEN]; - - count--; - printk(KERN_INFO "fork(): process `%s' used deprecated " - "clone flags 0x%lx\n", - get_task_comm(comm, current), - clone_flags & CLONE_STOPPED); - } - } - - /* * When called from kernel_thread, don't do user tracing stuff. */ if (likely(user_mode(regs))) @@ -1465,16 +1460,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - if (unlikely(clone_flags & CLONE_STOPPED)) { - /* - * We'll start up with an immediate SIGSTOP. - */ - sigaddset(&p->pending.signal, SIGSTOP); - set_tsk_thread_flag(p, TIF_SIGPENDING); - __set_task_state(p, TASK_STOPPED); - } else { - wake_up_new_task(p, clone_flags); - } + wake_up_new_task(p, clone_flags); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); diff --git a/kernel/freezer.c b/kernel/freezer.c index bd1d42b..66ecd2e 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only) } if (should_send_signal(p)) { - if (!signal_pending(p)) - fake_signal_wake_up(p); + fake_signal_wake_up(p); + /* + * fake_signal_wake_up() goes through p's scheduler + * lock and guarantees that TASK_STOPPED/TRACED -> + * TASK_RUNNING transition can't race with task state + * testing in try_to_freeze_tasks(). + */ } else if (sig_only) { return false; } else { diff --git a/kernel/futex.c b/kernel/futex.c index 3019b92..b766d28 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page; + struct page *page, *page_head; int err; /* @@ -265,11 +265,46 @@ again: if (err < 0) return err; - page = compound_head(page); - lock_page(page); - if (!page->mapping) { - unlock_page(page); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + page_head = page; + if (unlikely(PageTail(page))) { put_page(page); + /* serialize against __split_huge_page_splitting() */ + local_irq_disable(); + if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { + page_head = compound_head(page); + /* + * page_head is valid pointer but we must pin + * it before taking the PG_lock and/or + * PG_compound_lock. The moment we re-enable + * irqs __split_huge_page_splitting() can + * return and the head page can be freed from + * under us. We can't take the PG_lock and/or + * PG_compound_lock on a page that could be + * freed from under us. + */ + if (page != page_head) { + get_page(page_head); + put_page(page); + } + local_irq_enable(); + } else { + local_irq_enable(); + goto again; + } + } +#else + page_head = compound_head(page); + if (page != page_head) { + get_page(page_head); + put_page(page); + } +#endif + + lock_page(page_head); + if (!page_head->mapping) { + unlock_page(page_head); + put_page(page_head); goto again; } @@ -280,20 +315,20 @@ again: * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page)) { + if (PageAnon(page_head)) { key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page->mapping->host; - key->shared.pgoff = page->index; + key->shared.inode = page_head->mapping->host; + key->shared.pgoff = page_head->index; } get_futex_key_refs(key); - unlock_page(page); - put_page(page); + unlock_page(page_head); + put_page(page_head); return 0; } @@ -791,10 +826,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* - * This happens when we have stolen the lock and the original - * pending owner did not enqueue itself back on the rt_mutex. - * Thats not a tragedy. We know that way, that a lock waiter - * is on the fly. We make the futex_q waiter the pending owner. + * It is possible that the next waiter (the one that brought + * this owner to the kernel) timed out and is no longer + * waiting on the lock. */ if (!new_owner) new_owner = this->task; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f2429fc..0c8d7c0 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -497,7 +497,7 @@ static inline int hrtimer_is_hres_enabled(void) */ static inline int hrtimer_hres_active(void) { - return __get_cpu_var(hrtimer_bases).hres_active; + return __this_cpu_read(hrtimer_bases.hres_active); } /* @@ -1745,7 +1745,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, } /* - * A NULL parameter means "inifinte" + * A NULL parameter means "infinite" */ if (!expires) { schedule(); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9988d03..282f202 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; } static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) { + int cpu; + desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; @@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_count = 0; desc->irqs_unhandled = 0; desc->name = NULL; - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + for_each_possible_cpu(cpu) + *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; desc_smp_init(desc, node); } @@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node) if (!desc) return NULL; /* allocate based on nr_cpu_ids */ - desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), - gfp, node); + desc->kstat_irqs = alloc_percpu(unsigned int); if (!desc->kstat_irqs) goto err_desc; @@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node) return desc; err_kstat: - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); err_desc: kfree(desc); return NULL; @@ -166,7 +168,7 @@ static void free_desc(unsigned int irq) mutex_unlock(&sparse_irq_lock); free_masks(desc); - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); kfree(desc); } @@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; -static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; int __init early_irq_init(void) { int count, i, node = first_online_node; @@ -250,7 +251,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - desc[i].kstat_irqs = kstat_irqs_all[i]; + /* TODO : do this allocation on-demand ... */ + desc[i].kstat_irqs = alloc_percpu(unsigned int); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); @@ -275,6 +277,22 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { +#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) + struct irq_desc *desc; + unsigned int i; + + for (i = 0; i < cnt; i++) { + desc = irq_to_desc(start + i); + if (desc && !desc->kstat_irqs) { + unsigned int __percpu *stats = alloc_percpu(unsigned int); + + if (!stats) + return -1; + if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) + free_percpu(stats); + } + } +#endif return start; } #endif /* !CONFIG_SPARSE_IRQ */ @@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); - return desc ? desc->kstat_irqs[cpu] : 0; + + return desc && desc->kstat_irqs ? + *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } #ifdef CONFIG_GENERIC_HARDIRQS @@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq) int cpu; int sum = 0; - if (!desc) + if (!desc || !desc->kstat_irqs) return 0; for_each_possible_cpu(cpu) - sum += desc->kstat_irqs[cpu]; + sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } #endif /* CONFIG_GENERIC_HARDIRQS */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 91a5fa2..0caa59f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -577,7 +577,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } */ static int irq_thread(void *data) { - static struct sched_param param = { + static const struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; struct irqaction *action = data; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 90f8819..c58fa7d 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void) */ static void __irq_work_queue(struct irq_work *entry) { - struct irq_work **head, *next; + struct irq_work *next; - head = &get_cpu_var(irq_work_list); + preempt_disable(); do { - next = *head; + next = __this_cpu_read(irq_work_list); /* Can assign non-atomic because we keep the flags set. */ entry->next = next_flags(next, IRQ_WORK_FLAGS); - } while (cmpxchg(head, next, entry) != next); + } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next); /* The list was empty, raise self-interrupt to start processing. */ if (!irq_work_next(entry)) arch_irq_work_raise(); - put_cpu_var(irq_work_list); + preempt_enable(); } /* @@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue); */ void irq_work_run(void) { - struct irq_work *list, **head; + struct irq_work *list; - head = &__get_cpu_var(irq_work_list); - if (*head == NULL) + if (this_cpu_read(irq_work_list) == NULL) return; BUG_ON(!in_irq()); BUG_ON(!irqs_disabled()); - list = xchg(head, NULL); + list = this_cpu_xchg(irq_work_list, NULL); + while (list != NULL) { struct irq_work *entry = list; diff --git a/kernel/kexec.c b/kernel/kexec.c index b55045b..ec19b92 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, * just verifies it is an address we can use. * * Since the kernel does everything in page size chunks ensure - * the destination addreses are page aligned. Too many + * the destination addresses are page aligned. Too many * special cases crop of when we don't do this. The most * insidious is getting overlapping destination addresses * simply because addresses are changed to page size diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7663e5d..7798181 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -317,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) /* We have preemption disabled.. so it is safe to use __ versions */ static inline void set_kprobe_instance(struct kprobe *kp) { - __get_cpu_var(kprobe_instance) = kp; + __this_cpu_write(kprobe_instance, kp); } static inline void reset_kprobe_instance(void) { - __get_cpu_var(kprobe_instance) = NULL; + __this_cpu_write(kprobe_instance, NULL); } /* @@ -965,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) { - struct kprobe *cur = __get_cpu_var(kprobe_instance); + struct kprobe *cur = __this_cpu_read(kprobe_instance); /* * if we faulted "during" the execution of a user specified @@ -980,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) { - struct kprobe *cur = __get_cpu_var(kprobe_instance); + struct kprobe *cur = __this_cpu_read(kprobe_instance); int ret = 0; if (cur && cur->break_handler) { diff --git a/kernel/kthread.c b/kernel/kthread.c index 5355cfd..c55afba 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), wait_for_completion(&create.done); if (!IS_ERR(create.result)) { - static struct sched_param param = { .sched_priority = 0 }; + static const struct sched_param param = { .sched_priority = 0 }; va_list args; va_start(args, namefmt); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 17110a4..ee74b35 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v) seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < MAXLR; i++) { - if (latency_record[i].backtrace[0]) { + struct latency_record *lr = &latency_record[i]; + + if (lr->backtrace[0]) { int q; - seq_printf(m, "%i %lu %lu ", - latency_record[i].count, - latency_record[i].time, - latency_record[i].max); + seq_printf(m, "%i %lu %lu", + lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - char sym[KSYM_SYMBOL_LEN]; - char *c; - if (!latency_record[i].backtrace[q]) + unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (latency_record[i].backtrace[q] == ULONG_MAX) + if (bt == ULONG_MAX) break; - sprint_symbol(sym, latency_record[i].backtrace[q]); - c = strchr(sym, '+'); - if (c) - *c = 0; - seq_printf(m, "%s ", sym); + seq_printf(m, " %ps", (void *)bt); } seq_printf(m, "\n"); } diff --git a/kernel/panic.c b/kernel/panic.c index 4c13b1a..991bb87 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,6 +34,7 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); int panic_timeout; +EXPORT_SYMBOL_GPL(panic_timeout); ATOMIC_NOTIFIER_HEAD(panic_notifier_list); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 11847bf..05ebe84 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -38,6 +38,12 @@ #include <asm/irq_regs.h> +enum event_type_t { + EVENT_FLEXIBLE = 0x1, + EVENT_PINNED = 0x2, + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, +}; + atomic_t perf_task_events __read_mostly; static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -65,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; static atomic64_t perf_event_id; +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type); + +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type); + void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -72,6 +84,11 @@ extern __weak const char *perf_pmu_name(void) return "pmu"; } +static inline u64 perf_clock(void) +{ + return local_clock(); +} + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -240,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) put_ctx(ctx); } -static inline u64 perf_clock(void) -{ - return local_clock(); -} - /* * Update the record of the current time in a context. */ @@ -256,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx) ctx->timestamp = now; } +static u64 perf_event_time(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + return ctx ? ctx->time : 0; +} + /* * Update the total_time_enabled and total_time_running fields for a event. */ @@ -269,7 +287,7 @@ static void update_event_times(struct perf_event *event) return; if (ctx->is_active) - run_end = ctx->time; + run_end = perf_event_time(event); else run_end = event->tstamp_stopped; @@ -278,7 +296,7 @@ static void update_event_times(struct perf_event *event) if (event->state == PERF_EVENT_STATE_INACTIVE) run_end = event->tstamp_stopped; else - run_end = ctx->time; + run_end = perf_event_time(event); event->total_time_running = run_end - event->tstamp_running; } @@ -534,6 +552,7 @@ event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { + u64 tstamp = perf_event_time(event); u64 delta; /* * An event which could not be activated because of @@ -545,7 +564,7 @@ event_sched_out(struct perf_event *event, && !event_filter_match(event)) { delta = ctx->time - event->tstamp_stopped; event->tstamp_running += delta; - event->tstamp_stopped = ctx->time; + event->tstamp_stopped = tstamp; } if (event->state != PERF_EVENT_STATE_ACTIVE) @@ -556,7 +575,7 @@ event_sched_out(struct perf_event *event, event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = ctx->time; + event->tstamp_stopped = tstamp; event->pmu->del(event, 0); event->oncpu = -1; @@ -768,6 +787,8 @@ event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { + u64 tstamp = perf_event_time(event); + if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -784,9 +805,9 @@ event_sched_in(struct perf_event *event, return -EAGAIN; } - event->tstamp_running += ctx->time - event->tstamp_stopped; + event->tstamp_running += tstamp - event->tstamp_stopped; - event->shadow_ctx_time = ctx->time - ctx->timestamp; + event->shadow_ctx_time = tstamp - ctx->timestamp; if (!is_software_event(event)) cpuctx->active_oncpu++; @@ -898,11 +919,13 @@ static int group_can_go_on(struct perf_event *event, static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { + u64 tstamp = perf_event_time(event); + list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = ctx->time; - event->tstamp_running = ctx->time; - event->tstamp_stopped = ctx->time; + event->tstamp_enabled = tstamp; + event->tstamp_running = tstamp; + event->tstamp_stopped = tstamp; } /* @@ -937,7 +960,7 @@ static void __perf_install_in_context(void *info) add_event_to_ctx(event, ctx); - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) goto unlock; /* @@ -1042,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *sub; + u64 tstamp = perf_event_time(event); event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = ctx->time - event->total_time_enabled; + event->tstamp_enabled = tstamp - event->total_time_enabled; list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = - ctx->time - sub->total_time_enabled; - } + if (sub->state >= PERF_EVENT_STATE_INACTIVE) + sub->tstamp_enabled = tstamp - sub->total_time_enabled; } } @@ -1082,7 +1104,7 @@ static void __perf_event_enable(void *info) goto unlock; __perf_event_mark_enabled(event, ctx); - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) goto unlock; /* @@ -1193,12 +1215,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return 0; } -enum event_type_t { - EVENT_FLEXIBLE = 0x1, - EVENT_PINNED = 0x2, - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, -}; - static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) @@ -1435,7 +1451,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, list_for_each_entry(event, &ctx->pinned_groups, group_entry) { if (event->state <= PERF_EVENT_STATE_OFF) continue; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) continue; if (group_can_go_on(event, cpuctx, 1)) @@ -1467,7 +1483,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, * Listen to the 'cpu' scheduling filter constraint * of events: */ - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) continue; if (group_can_go_on(event, cpuctx, can_add_hw)) { @@ -1694,7 +1710,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) if (event->state != PERF_EVENT_STATE_ACTIVE) continue; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) continue; hwc = &event->hw; @@ -3893,7 +3909,7 @@ static int perf_event_task_match(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) return 0; if (event->attr.comm || event->attr.mmap || @@ -4030,7 +4046,7 @@ static int perf_event_comm_match(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) return 0; if (event->attr.comm) @@ -4178,7 +4194,7 @@ static int perf_event_mmap_match(struct perf_event *event, if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) return 0; if ((!executable && event->attr.mmap_data) || @@ -4648,7 +4664,7 @@ int perf_swevent_get_recursion_context(void) } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -void inline perf_swevent_put_recursion_context(int rctx) +inline void perf_swevent_put_recursion_context(int rctx) { struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a5aff3e..2657299 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG depends on PM_ADVANCED_DEBUG default n -config SUSPEND_NVS - bool - config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE - select SUSPEND_NVS if HAS_IOMEM default y ---help--- Allow the system to enter sleep states in which main memory is @@ -140,7 +136,6 @@ config HIBERNATION depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE select LZO_COMPRESS select LZO_DECOMPRESS - select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f9063c6..c350e18 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,7 +1,4 @@ - -ifeq ($(CONFIG_PM_DEBUG),y) -EXTRA_CFLAGS += -DDEBUG -endif +ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o @@ -10,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o -obj-$(CONFIG_SUSPEND_NVS) += nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 048d0b5..1832bd2 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -51,18 +51,18 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; -static struct platform_hibernation_ops *hibernation_ops; +static const struct platform_hibernation_ops *hibernation_ops; /** * hibernation_set_ops - set the global hibernate operations * @ops: the hibernation operations to use in subsequent hibernation transitions */ -void hibernation_set_ops(struct platform_hibernation_ops *ops) +void hibernation_set_ops(const struct platform_hibernation_ops *ops) { if (ops && !(ops->begin && ops->end && ops->pre_snapshot && ops->prepare && ops->finish && ops->enter && ops->pre_restore - && ops->restore_cleanup)) { + && ops->restore_cleanup && ops->leave)) { WARN_ON(1); return; } @@ -278,7 +278,7 @@ static int create_image(int platform_mode) goto Enable_irqs; } - if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) + if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) goto Power_up; in_suspend = 1; @@ -516,7 +516,7 @@ int hibernation_platform_enter(void) local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); - if (!pm_check_wakeup_events()) { + if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; } @@ -647,6 +647,7 @@ int hibernate(void) swsusp_free(); if (!error) power_down(); + in_suspend = 0; pm_restore_gfp_mask(); } else { pr_debug("PM: Image restored successfully.\n"); diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c deleted file mode 100644 index 1836db6..0000000 --- a/kernel/power/nvs.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory - * - * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. - * - * This file is released under the GPLv2. - */ - -#include <linux/io.h> -#include <linux/kernel.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/suspend.h> - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * suspend and to restore the contents of this memory during the subsequent - * resume. The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { - unsigned long phys_start; - unsigned int size; - void *kaddr; - void *data; - struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - * suspend_nvs_register - register platform NVS memory region to save - * @start - physical address of the region - * @size - size of the region - * - * The NVS region need not be page-aligned (both ends) and we arrange - * things so that the data from page-aligned addresses in this region will - * be copied into separate RAM pages. - */ -int suspend_nvs_register(unsigned long start, unsigned long size) -{ - struct nvs_page *entry, *next; - - while (size > 0) { - unsigned int nr_bytes; - - entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); - if (!entry) - goto Error; - - list_add_tail(&entry->node, &nvs_list); - entry->phys_start = start; - nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); - entry->size = (size < nr_bytes) ? size : nr_bytes; - - start += entry->size; - size -= entry->size; - } - return 0; - - Error: - list_for_each_entry_safe(entry, next, &nvs_list, node) { - list_del(&entry->node); - kfree(entry); - } - return -ENOMEM; -} - -/** - * suspend_nvs_free - free data pages allocated for saving NVS regions - */ -void suspend_nvs_free(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - free_page((unsigned long)entry->data); - entry->data = NULL; - if (entry->kaddr) { - iounmap(entry->kaddr); - entry->kaddr = NULL; - } - } -} - -/** - * suspend_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int suspend_nvs_alloc(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) { - entry->data = (void *)__get_free_page(GFP_KERNEL); - if (!entry->data) { - suspend_nvs_free(); - return -ENOMEM; - } - } - return 0; -} - -/** - * suspend_nvs_save - save NVS memory regions - */ -void suspend_nvs_save(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Saving platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - entry->kaddr = ioremap(entry->phys_start, entry->size); - memcpy(entry->data, entry->kaddr, entry->size); - } -} - -/** - * suspend_nvs_restore - restore NVS memory regions - * - * This function is going to be called with interrupts disabled, so it - * cannot iounmap the virtual addresses used to access the NVS region. - */ -void suspend_nvs_restore(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) - memcpy(entry->kaddr, entry->data, entry->size); -} diff --git a/kernel/power/process.c b/kernel/power/process.c index e50b4c1..d6d2a10 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only) * perturb a task in TASK_STOPPED or TASK_TRACED. * It is "frozen enough". If the task does wake * up, it will immediately call try_to_freeze. + * + * Because freeze_task() goes through p's + * scheduler lock after setting TIF_FREEZE, it's + * guaranteed that either we see TASK_RUNNING or + * try_to_stop() after schedule() in ptrace/signal + * stop sees TIF_FREEZE. */ if (!task_is_stopped_or_traced(p) && !freezer_should_skip(p)) @@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only) if (!todo || time_after(jiffies, end_time)) break; - if (!pm_check_wakeup_events()) { + if (pm_wakeup_pending()) { wakeup = true; break; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 031d5e3..de6f86b 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -31,13 +31,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_MEM] = "mem", }; -static struct platform_suspend_ops *suspend_ops; +static const struct platform_suspend_ops *suspend_ops; /** * suspend_set_ops - Set the global suspend method table. * @ops: Pointer to ops structure. */ -void suspend_set_ops(struct platform_suspend_ops *ops) +void suspend_set_ops(const struct platform_suspend_ops *ops) { mutex_lock(&pm_mutex); suspend_ops = ops; @@ -164,7 +164,7 @@ static int suspend_enter(suspend_state_t state) error = sysdev_suspend(PMSG_SUSPEND); if (!error) { - if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { + if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); events_check_enabled = false; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8c7e483..7c97c3a 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -224,7 +224,7 @@ static int swsusp_swap_check(void) return res; root_swap = res; - res = blkdev_get(hib_resume_bdev, FMODE_WRITE); + res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); if (res) return res; @@ -888,7 +888,7 @@ out_finish: /** * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should - * be written into this memeory location + * be written into this memory location */ int swsusp_read(unsigned int *flags_p) @@ -930,7 +930,8 @@ int swsusp_check(void) { int error; - hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + FMODE_READ, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); diff --git a/kernel/printk.c b/kernel/printk.c index ab3ffc5..53d9a9e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -39,16 +39,11 @@ #include <linux/syslog.h> #include <linux/cpu.h> #include <linux/notifier.h> +#include <linux/rculist.h> #include <asm/uaccess.h> /* - * for_each_console() allows you to iterate on each console - */ -#define for_each_console(con) \ - for (con = console_drivers; con != NULL; con = con->next) - -/* * Architectures can override it: */ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) @@ -279,12 +274,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) * at open time. */ if (type == SYSLOG_ACTION_OPEN || !from_file) { - if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) - return -EPERM; + if (dmesg_restrict && !capable(CAP_SYSLOG)) + goto warn; /* switch to return -EPERM after 2.6.39 */ if ((type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; + !capable(CAP_SYSLOG)) + goto warn; /* switch to return -EPERM after 2.6.39 */ } error = security_syslog(type); @@ -428,6 +423,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) } out: return error; +warn: + /* remove after 2.6.39 */ + if (capable(CAP_SYS_ADMIN)) + WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " + "but no CAP_SYSLOG (deprecated and denied).\n"); + return -EPERM; } SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) @@ -1359,6 +1360,7 @@ void register_console(struct console *newcon) spin_unlock_irqrestore(&logbuf_lock, flags); } release_console_sem(); + console_sysfs_notify(); /* * By unregistering the bootconsoles after we enable the real console @@ -1417,6 +1419,7 @@ int unregister_console(struct console *console) console_drivers->flags |= CON_CONSDEV; release_console_sem(); + console_sysfs_notify(); return res; } EXPORT_SYMBOL(unregister_console); @@ -1500,7 +1503,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper) /* Don't allow registering multiple times */ if (!dumper->registered) { dumper->registered = 1; - list_add_tail(&dumper->list, &dump_list); + list_add_tail_rcu(&dumper->list, &dump_list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); @@ -1524,29 +1527,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) spin_lock_irqsave(&dump_list_lock, flags); if (dumper->registered) { dumper->registered = 0; - list_del(&dumper->list); + list_del_rcu(&dumper->list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); + synchronize_rcu(); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); -static const char * const kmsg_reasons[] = { - [KMSG_DUMP_OOPS] = "oops", - [KMSG_DUMP_PANIC] = "panic", - [KMSG_DUMP_KEXEC] = "kexec", -}; - -static const char *kmsg_to_str(enum kmsg_dump_reason reason) -{ - if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0) - return "unknown"; - - return kmsg_reasons[reason]; -} - /** * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping @@ -1585,13 +1575,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) l2 = chars; } - if (!spin_trylock_irqsave(&dump_list_lock, flags)) { - printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", - kmsg_to_str(reason)); - return; - } - list_for_each_entry(dumper, &dump_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) dumper->dump(dumper, reason, s1, l1, s2, l2); - spin_unlock_irqrestore(&dump_list_lock, flags); + rcu_read_unlock(); } #endif diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 0344937..0c343b9 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -189,7 +189,8 @@ static int rcu_kthread(void *arg) unsigned long flags; for (;;) { - wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0); + wait_event_interruptible(rcu_kthread_wq, + have_rcu_kthread_work != 0); morework = rcu_boost(); local_irq_save(flags); work = have_rcu_kthread_work; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d0ddfea..dd4aea8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -364,8 +364,8 @@ void rcu_irq_exit(void) WARN_ON_ONCE(rdtp->dynticks & 0x1); /* If the interrupt queued a callback, get out of dyntick mode. */ - if (__get_cpu_var(rcu_sched_data).nxtlist || - __get_cpu_var(rcu_bh_data).nxtlist) + if (__this_cpu_read(rcu_sched_data.nxtlist) || + __this_cpu_read(rcu_bh_data.nxtlist)) set_need_resched(); } diff --git a/kernel/sched.c b/kernel/sched.c index 0494908..ea3e5ef 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -278,14 +278,12 @@ struct task_group { #endif }; -#define root_task_group init_task_group - /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); #ifdef CONFIG_FAIR_GROUP_SCHED -# define INIT_TASK_GROUP_LOAD NICE_0_LOAD +# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD /* * A weight of 0 or 1 can cause arithmetics problems. @@ -298,13 +296,13 @@ static DEFINE_SPINLOCK(task_group_lock); #define MIN_SHARES 2 #define MAX_SHARES (1UL << 18) -static int init_task_group_load = INIT_TASK_GROUP_LOAD; +static int root_task_group_load = ROOT_TASK_GROUP_LOAD; #endif /* Default task group. * Every task in system belong to this group at bootup. */ -struct task_group init_task_group; +struct task_group root_task_group; #endif /* CONFIG_CGROUP_SCHED */ @@ -743,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; cmp = strstrip(buf); - if (strncmp(buf, "NO_", 3) == 0) { + if (strncmp(cmp, "NO_", 3) == 0) { neg = 1; cmp += 3; } @@ -2507,7 +2505,7 @@ out: * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened * - * Put @p on the run-queue if it's not alredy there. The caller must + * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. this_rq() stays locked over invocation. */ @@ -7848,7 +7846,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, cfs_rq->tg = tg; tg->se[cpu] = se; - /* se could be NULL for init_task_group */ + /* se could be NULL for root_task_group */ if (!se) return; @@ -7908,18 +7906,18 @@ void __init sched_init(void) ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); #ifdef CONFIG_FAIR_GROUP_SCHED - init_task_group.se = (struct sched_entity **)ptr; + root_task_group.se = (struct sched_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); - init_task_group.cfs_rq = (struct cfs_rq **)ptr; + root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED - init_task_group.rt_se = (struct sched_rt_entity **)ptr; + root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); - init_task_group.rt_rq = (struct rt_rq **)ptr; + root_task_group.rt_rq = (struct rt_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_RT_GROUP_SCHED */ @@ -7939,13 +7937,13 @@ void __init sched_init(void) global_rt_period(), global_rt_runtime()); #ifdef CONFIG_RT_GROUP_SCHED - init_rt_bandwidth(&init_task_group.rt_bandwidth, + init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime()); #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED - list_add(&init_task_group.list, &task_groups); - INIT_LIST_HEAD(&init_task_group.children); + list_add(&root_task_group.list, &task_groups); + INIT_LIST_HEAD(&root_task_group.children); autogroup_init(&init_task); #endif /* CONFIG_CGROUP_SCHED */ @@ -7960,34 +7958,34 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED - init_task_group.shares = init_task_group_load; + root_task_group.shares = root_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); /* - * How much cpu bandwidth does init_task_group get? + * How much cpu bandwidth does root_task_group get? * * In case of task-groups formed thr' the cgroup filesystem, it * gets 100% of the cpu resources in the system. This overall * system cpu resource is divided among the tasks of - * init_task_group and its child task-groups in a fair manner, + * root_task_group and its child task-groups in a fair manner, * based on each entity's (task or task-group's) weight * (se->load.weight). * - * In other words, if init_task_group has 10 tasks of weight + * In other words, if root_task_group has 10 tasks of weight * 1024) and two child groups A0 and A1 (of weight 1024 each), * then A0's share of the cpu resource is: * * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% * - * We achieve this by letting init_task_group's tasks sit - * directly in rq->cfs (i.e init_task_group->se[] = NULL). + * We achieve this by letting root_task_group's tasks sit + * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ - init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL); + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); - init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL); + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -8379,6 +8377,7 @@ static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); free_rt_sched_group(tg); + autogroup_free(tg); kfree(tg); } @@ -8812,7 +8811,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!cgrp->parent) { /* This is early initialization for the top cgroup */ - return &init_task_group.css; + return &root_task_group.css; } parent = cgroup_tg(cgrp->parent); diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index c80fedc..32a723b 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -9,10 +9,10 @@ unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; -static void autogroup_init(struct task_struct *init_task) +static void __init autogroup_init(struct task_struct *init_task) { - autogroup_default.tg = &init_task_group; - init_task_group.autogroup = &autogroup_default; + autogroup_default.tg = &root_task_group; + root_task_group.autogroup = &autogroup_default; kref_init(&autogroup_default.kref); init_rwsem(&autogroup_default.lock); init_task->signal->autogroup = &autogroup_default; @@ -63,7 +63,7 @@ static inline struct autogroup *autogroup_create(void) if (!ag) goto out_fail; - tg = sched_create_group(&init_task_group); + tg = sched_create_group(&root_task_group); if (IS_ERR(tg)) goto out_free; diff --git a/kernel/smp.c b/kernel/smp.c index 12ed8b0..4ec30e0 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -13,6 +13,7 @@ #include <linux/smp.h> #include <linux/cpu.h> +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS static struct { struct list_head queue; raw_spinlock_t lock; @@ -529,3 +530,21 @@ void ipi_call_unlock_irq(void) { raw_spin_unlock_irq(&call_function.lock); } +#endif /* USE_GENERIC_SMP_HELPERS */ + +/* + * Call a function on all processors + */ +int on_each_cpu(void (*func) (void *info), void *info, int wait) +{ + int ret = 0; + + preempt_disable(); + ret = smp_call_function(func, info, wait); + local_irq_disable(); + func(info); + local_irq_enable(); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(on_each_cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c index d4d918a..68eb5ef 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -70,7 +70,7 @@ char *softirq_to_name[NR_SOFTIRQS] = { static void wakeup_softirqd(void) { /* Interrupts are disabled: no need to stop preemption */ - struct task_struct *tsk = __get_cpu_var(ksoftirqd); + struct task_struct *tsk = __this_cpu_read(ksoftirqd); if (tsk && tsk->state != TASK_RUNNING) wake_up_process(tsk); @@ -388,8 +388,8 @@ void __tasklet_schedule(struct tasklet_struct *t) local_irq_save(flags); t->next = NULL; - *__get_cpu_var(tasklet_vec).tail = t; - __get_cpu_var(tasklet_vec).tail = &(t->next); + *__this_cpu_read(tasklet_vec.tail) = t; + __this_cpu_write(tasklet_vec.tail, &(t->next)); raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -402,8 +402,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) local_irq_save(flags); t->next = NULL; - *__get_cpu_var(tasklet_hi_vec).tail = t; - __get_cpu_var(tasklet_hi_vec).tail = &(t->next); + *__this_cpu_read(tasklet_hi_vec.tail) = t; + __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); raise_softirq_irqoff(HI_SOFTIRQ); local_irq_restore(flags); } @@ -414,8 +414,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) { BUG_ON(!irqs_disabled()); - t->next = __get_cpu_var(tasklet_hi_vec).head; - __get_cpu_var(tasklet_hi_vec).head = t; + t->next = __this_cpu_read(tasklet_hi_vec.head); + __this_cpu_write(tasklet_hi_vec.head, t); __raise_softirq_irqoff(HI_SOFTIRQ); } @@ -426,9 +426,9 @@ static void tasklet_action(struct softirq_action *a) struct tasklet_struct *list; local_irq_disable(); - list = __get_cpu_var(tasklet_vec).head; - __get_cpu_var(tasklet_vec).head = NULL; - __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; + list = __this_cpu_read(tasklet_vec.head); + __this_cpu_write(tasklet_vec.head, NULL); + __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); local_irq_enable(); while (list) { @@ -449,8 +449,8 @@ static void tasklet_action(struct softirq_action *a) local_irq_disable(); t->next = NULL; - *__get_cpu_var(tasklet_vec).tail = t; - __get_cpu_var(tasklet_vec).tail = &(t->next); + *__this_cpu_read(tasklet_vec.tail) = t; + __this_cpu_write(tasklet_vec.tail, &(t->next)); __raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_enable(); } @@ -461,9 +461,9 @@ static void tasklet_hi_action(struct softirq_action *a) struct tasklet_struct *list; local_irq_disable(); - list = __get_cpu_var(tasklet_hi_vec).head; - __get_cpu_var(tasklet_hi_vec).head = NULL; - __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; + list = __this_cpu_read(tasklet_hi_vec.head); + __this_cpu_write(tasklet_hi_vec.head, NULL); + __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); local_irq_enable(); while (list) { @@ -484,8 +484,8 @@ static void tasklet_hi_action(struct softirq_action *a) local_irq_disable(); t->next = NULL; - *__get_cpu_var(tasklet_hi_vec).tail = t; - __get_cpu_var(tasklet_hi_vec).tail = &(t->next); + *__this_cpu_read(tasklet_hi_vec.tail) = t; + __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); __raise_softirq_irqoff(HI_SOFTIRQ); local_irq_enable(); } @@ -802,16 +802,16 @@ static void takeover_tasklets(unsigned int cpu) /* Find end, append list for that CPU. */ if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { - *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; - __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; + *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; + this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); per_cpu(tasklet_vec, cpu).head = NULL; per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; } raise_softirq_irqoff(TASKLET_SOFTIRQ); if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { - *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; - __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; + *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; + __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail); per_cpu(tasklet_hi_vec, cpu).head = NULL; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; } @@ -853,7 +853,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: { - static struct sched_param param = { + static const struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; @@ -885,25 +885,6 @@ static __init int spawn_ksoftirqd(void) } early_initcall(spawn_ksoftirqd); -#ifdef CONFIG_SMP -/* - * Call a function on all processors - */ -int on_each_cpu(void (*func) (void *info), void *info, int wait) -{ - int ret = 0; - - preempt_disable(); - ret = smp_call_function(func, info, wait); - local_irq_disable(); - func(info); - local_irq_enable(); - preempt_enable(); - return ret; -} -EXPORT_SYMBOL(on_each_cpu); -#endif - /* * [ These __weak aliases are kept in a separate compilation unit, so that * GCC does not inline them incorrectly. ] diff --git a/kernel/srcu.c b/kernel/srcu.c index 98d8c1e..73ce23f 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -156,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx) EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below) to allow SRCU readers to exit their read-side critical + * sections. If there are still some readers after 10 microseconds, + * we repeatedly block for 1-millisecond time periods. This approach + * has done well in testing, so there is no need for a config parameter. + */ +#define SYNCHRONIZE_SRCU_READER_DELAY 10 + +/* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) @@ -207,11 +217,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) * will have finished executing. We initially give readers * an arbitrarily chosen 10 microseconds to get out of their * SRCU read-side critical sections, then loop waiting 1/HZ - * seconds per iteration. + * seconds per iteration. The 10-microsecond value has done + * very well in testing. */ if (srcu_readers_active_idx(sp, idx)) - udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY); + udelay(SYNCHRONIZE_SRCU_READER_DELAY); while (srcu_readers_active_idx(sp, idx)) schedule_timeout_interruptible(1); diff --git a/kernel/sys.c b/kernel/sys.c index 2745dcd..31b71a2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -43,6 +43,8 @@ #include <linux/kprobes.h> #include <linux/user_namespace.h> +#include <linux/kmsg_dump.h> + #include <asm/uaccess.h> #include <asm/io.h> #include <asm/unistd.h> @@ -285,6 +287,7 @@ out_unlock: */ void emergency_restart(void) { + kmsg_dump(KMSG_DUMP_EMERG); machine_emergency_restart(); } EXPORT_SYMBOL_GPL(emergency_restart); @@ -312,6 +315,7 @@ void kernel_restart(char *cmd) printk(KERN_EMERG "Restarting system.\n"); else printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + kmsg_dump(KMSG_DUMP_RESTART); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -333,6 +337,7 @@ void kernel_halt(void) kernel_shutdown_prepare(SYSTEM_HALT); sysdev_shutdown(); printk(KERN_EMERG "System halted.\n"); + kmsg_dump(KMSG_DUMP_HALT); machine_halt(); } @@ -351,6 +356,7 @@ void kernel_power_off(void) disable_nonboot_cpus(); sysdev_shutdown(); printk(KERN_EMERG "Power down.\n"); + kmsg_dump(KMSG_DUMP_POWEROFF); machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae5cbb1..bc86bb3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/signal.h> +#include <linux/printk.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> @@ -245,10 +246,6 @@ static struct ctl_table root_table[] = { .mode = 0555, .child = dev_table, }, -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { } }; @@ -710,6 +707,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, #endif { .procname = "ngroups_max", @@ -962,10 +968,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { } }; @@ -1326,11 +1328,6 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif - -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { } }; @@ -1486,10 +1483,6 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { } }; @@ -2899,7 +2892,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } } -#else /* CONFIG_PROC_FS */ +#else /* CONFIG_PROC_SYSCTL */ int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2951,7 +2944,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, } -#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_PROC_SYSCTL */ /* * No sense putting this after each symbol definition, twice, diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 4b2545a..b875bed 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1192,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file, buf[result] = '\0'; - /* Convert the decnet addresss to binary */ + /* Convert the decnet address to binary */ result = -EIO; nodep = strchr(buf, '.') + 1; if (!nodep) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 3308fd7..3971c6b 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, return -ENOMEM; if (!info) { - int seq = get_cpu_var(taskstats_seqnum)++; - put_cpu_var(taskstats_seqnum); + int seq = this_cpu_inc_return(taskstats_seqnum) - 1; reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); } else @@ -349,7 +348,7 @@ static int parse(struct nlattr *na, struct cpumask *mask) return ret; } -#ifdef CONFIG_IA64 +#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define TASKSTATS_NEEDS_PADDING 1 #endif @@ -612,7 +611,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) fill_tgid_exit(tsk); } - listeners = &__raw_get_cpu_var(listener_array); + listeners = __this_cpu_ptr(&listener_array); if (list_empty(&listeners->list)) return; diff --git a/kernel/time.c b/kernel/time.c index ba9b338..3217435 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time); * Avoid unnecessary multiplications/divisions in the * two most common HZ cases: */ -unsigned int inline jiffies_to_msecs(const unsigned long j) +inline unsigned int jiffies_to_msecs(const unsigned long j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; @@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_msecs); -unsigned int inline jiffies_to_usecs(const unsigned long j) +inline unsigned int jiffies_to_usecs(const unsigned long j) { #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) return (USEC_PER_SEC / HZ) * j; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c18d7ef..6519cf62 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * @shift: pointer to shift variable * @from: frequency to convert from * @to: frequency to convert to - * @minsec: guaranteed runtime conversion range in seconds + * @maxsec: guaranteed runtime conversion range in seconds * * The function evaluates the shift/mult pair for the scaled math * operations of clocksources and clockevents. @@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock * event @to is the counter frequency and @from is NSEC_PER_SEC. * - * The @minsec conversion range argument controls the time frame in + * The @maxsec conversion range argument controls the time frame in * seconds which must be covered by the runtime conversion with the * calculated mult and shift factors. This guarantees that no 64bit * overflow happens when the input value of the conversion is @@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * factors. */ void -clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) +clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) { u64 tmp; u32 sft, sftacc= 32; @@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) * Calculate the shift factor which is limiting the conversion * range: */ - tmp = ((u64)minsec * from) >> 32; + tmp = ((u64)maxsec * from) >> 32; while (tmp) { tmp >>=1; sftacc--; @@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) */ for (sft = 32; sft > 0; sft--) { tmp = (u64) to << sft; + tmp += from / 2; do_div(tmp, from); if ((tmp >> sftacc) == 0) break; @@ -678,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) { - /* Intialize mult/shift and max_idle_ns */ + /* Initialize mult/shift and max_idle_ns */ __clocksource_updatefreq_scale(cs, scale, freq); /* Add clocksource to the clcoksource list */ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d232189..5c00242 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -14,6 +14,7 @@ #include <linux/timex.h> #include <linux/time.h> #include <linux/mm.h> +#include <linux/module.h> /* * NTP timekeeping variables: @@ -74,6 +75,162 @@ static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; +#ifdef CONFIG_NTP_PPS + +/* + * The following variables are used when a pulse-per-second (PPS) signal + * is available. They establish the engineering parameters of the clock + * discipline loop when controlled by the PPS signal. + */ +#define PPS_VALID 10 /* PPS signal watchdog max (s) */ +#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ +#define PPS_INTMIN 2 /* min freq interval (s) (shift) */ +#define PPS_INTMAX 8 /* max freq interval (s) (shift) */ +#define PPS_INTCOUNT 4 /* number of consecutive good intervals to + increase pps_shift or consecutive bad + intervals to decrease it */ +#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ + +static int pps_valid; /* signal watchdog counter */ +static long pps_tf[3]; /* phase median filter */ +static long pps_jitter; /* current jitter (ns) */ +static struct timespec pps_fbase; /* beginning of the last freq interval */ +static int pps_shift; /* current interval duration (s) (shift) */ +static int pps_intcnt; /* interval counter */ +static s64 pps_freq; /* frequency offset (scaled ns/s) */ +static long pps_stabil; /* current stability (scaled ns/s) */ + +/* + * PPS signal quality monitors + */ +static long pps_calcnt; /* calibration intervals */ +static long pps_jitcnt; /* jitter limit exceeded */ +static long pps_stbcnt; /* stability limit exceeded */ +static long pps_errcnt; /* calibration errors */ + + +/* PPS kernel consumer compensates the whole phase error immediately. + * Otherwise, reduce the offset by a fixed factor times the time constant. + */ +static inline s64 ntp_offset_chunk(s64 offset) +{ + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + return offset; + else + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) +{ + /* the PPS calibration interval may end + surprisingly early */ + pps_shift = PPS_INTMIN; + pps_intcnt = 0; +} + +/** + * pps_clear - Clears the PPS state variables + * + * Must be called while holding a write on the xtime_lock + */ +static inline void pps_clear(void) +{ + pps_reset_freq_interval(); + pps_tf[0] = 0; + pps_tf[1] = 0; + pps_tf[2] = 0; + pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; + pps_freq = 0; +} + +/* Decrease pps_valid to indicate that another second has passed since + * the last PPS signal. When it reaches 0, indicate that PPS signal is + * missing. + * + * Must be called while holding a write on the xtime_lock + */ +static inline void pps_dec_valid(void) +{ + if (pps_valid > 0) + pps_valid--; + else { + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + pps_clear(); + } +} + +static inline void pps_set_freq(s64 freq) +{ + pps_freq = freq; +} + +static inline int is_error_status(int status) +{ + return (time_status & (STA_UNSYNC|STA_CLOCKERR)) + /* PPS signal lost when either PPS time or + * PPS frequency synchronization requested + */ + || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) + && !(time_status & STA_PPSSIGNAL)) + /* PPS jitter exceeded when + * PPS time synchronization requested */ + || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) + == (STA_PPSTIME|STA_PPSJITTER)) + /* PPS wander exceeded or calibration error when + * PPS frequency synchronization requested + */ + || ((time_status & STA_PPSFREQ) + && (time_status & (STA_PPSWANDER|STA_PPSERROR))); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * + PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->jitter = pps_jitter; + if (!(time_status & STA_NANO)) + txc->jitter /= NSEC_PER_USEC; + txc->shift = pps_shift; + txc->stabil = pps_stabil; + txc->jitcnt = pps_jitcnt; + txc->calcnt = pps_calcnt; + txc->errcnt = pps_errcnt; + txc->stbcnt = pps_stbcnt; +} + +#else /* !CONFIG_NTP_PPS */ + +static inline s64 ntp_offset_chunk(s64 offset) +{ + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) {} +static inline void pps_clear(void) {} +static inline void pps_dec_valid(void) {} +static inline void pps_set_freq(s64 freq) {} + +static inline int is_error_status(int status) +{ + return status & (STA_UNSYNC|STA_CLOCKERR); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; +} + +#endif /* CONFIG_NTP_PPS */ + /* * NTP methods: */ @@ -185,6 +342,9 @@ void ntp_clear(void) tick_length = tick_length_base; time_offset = 0; + + /* Clear PPS state variables */ + pps_clear(); } /* @@ -250,16 +410,16 @@ void second_overflow(void) time_status |= STA_UNSYNC; } - /* - * Compute the phase adjustment for the next second. The offset is - * reduced by a fixed factor times the time constant. - */ + /* Compute the phase adjustment for the next second */ tick_length = tick_length_base; - delta = shift_right(time_offset, SHIFT_PLL + time_constant); + delta = ntp_offset_chunk(time_offset); time_offset -= delta; tick_length += delta; + /* Check PPS signal */ + pps_dec_valid(); + if (!time_adjust) return; @@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; + /* restart PPS frequency calibration */ + pps_reset_freq_interval(); } /* @@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts time_freq = txc->freq * PPM_SCALE; time_freq = min(time_freq, MAXFREQ_SCALED); time_freq = max(time_freq, -MAXFREQ_SCALED); + /* update pps_freq */ + pps_set_freq(time_freq); } if (txc->modes & ADJ_MAXERROR) @@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc) } result = time_state; /* mostly `TIME_OK' */ - if (time_status & (STA_UNSYNC|STA_CLOCKERR)) + /* check for errors */ + if (is_error_status(time_status)) result = TIME_ERROR; txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * @@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc) txc->tick = tick_usec; txc->tai = time_tai; - /* PPS is not implemented, so these are zero */ - txc->ppsfreq = 0; - txc->jitter = 0; - txc->shift = 0; - txc->stabil = 0; - txc->jitcnt = 0; - txc->calcnt = 0; - txc->errcnt = 0; - txc->stbcnt = 0; + /* fill PPS status fields */ + pps_fill_timex(txc); write_sequnlock_irq(&xtime_lock); @@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc) return result; } +#ifdef CONFIG_NTP_PPS + +/* actually struct pps_normtime is good old struct timespec, but it is + * semantically different (and it is the reason why it was invented): + * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ +struct pps_normtime { + __kernel_time_t sec; /* seconds */ + long nsec; /* nanoseconds */ +}; + +/* normalize the timestamp so that nsec is in the + ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +static inline struct pps_normtime pps_normalize_ts(struct timespec ts) +{ + struct pps_normtime norm = { + .sec = ts.tv_sec, + .nsec = ts.tv_nsec + }; + + if (norm.nsec > (NSEC_PER_SEC >> 1)) { + norm.nsec -= NSEC_PER_SEC; + norm.sec++; + } + + return norm; +} + +/* get current phase correction and jitter */ +static inline long pps_phase_filter_get(long *jitter) +{ + *jitter = pps_tf[0] - pps_tf[1]; + if (*jitter < 0) + *jitter = -*jitter; + + /* TODO: test various filters */ + return pps_tf[0]; +} + +/* add the sample to the phase filter */ +static inline void pps_phase_filter_add(long err) +{ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = err; +} + +/* decrease frequency calibration interval length. + * It is halved after four consecutive unstable intervals. + */ +static inline void pps_dec_freq_interval(void) +{ + if (--pps_intcnt <= -PPS_INTCOUNT) { + pps_intcnt = -PPS_INTCOUNT; + if (pps_shift > PPS_INTMIN) { + pps_shift--; + pps_intcnt = 0; + } + } +} + +/* increase frequency calibration interval length. + * It is doubled after four consecutive stable intervals. + */ +static inline void pps_inc_freq_interval(void) +{ + if (++pps_intcnt >= PPS_INTCOUNT) { + pps_intcnt = PPS_INTCOUNT; + if (pps_shift < PPS_INTMAX) { + pps_shift++; + pps_intcnt = 0; + } + } +} + +/* update clock frequency based on MONOTONIC_RAW clock PPS signal + * timestamps + * + * At the end of the calibration interval the difference between the + * first and last MONOTONIC_RAW clock timestamps divided by the length + * of the interval becomes the frequency update. If the interval was + * too long, the data are discarded. + * Returns the difference between old and new frequency values. + */ +static long hardpps_update_freq(struct pps_normtime freq_norm) +{ + long delta, delta_mod; + s64 ftemp; + + /* check if the frequency interval was too long */ + if (freq_norm.sec > (2 << pps_shift)) { + time_status |= STA_PPSERROR; + pps_errcnt++; + pps_dec_freq_interval(); + pr_err("hardpps: PPSERROR: interval too long - %ld s\n", + freq_norm.sec); + return 0; + } + + /* here the raw frequency offset and wander (stability) is + * calculated. If the wander is less than the wander threshold + * the interval is increased; otherwise it is decreased. + */ + ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, + freq_norm.sec); + delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); + pps_freq = ftemp; + if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { + pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); + time_status |= STA_PPSWANDER; + pps_stbcnt++; + pps_dec_freq_interval(); + } else { /* good sample */ + pps_inc_freq_interval(); + } + + /* the stability metric is calculated as the average of recent + * frequency changes, but is used only for performance + * monitoring + */ + delta_mod = delta; + if (delta_mod < 0) + delta_mod = -delta_mod; + pps_stabil += (div_s64(((s64)delta_mod) << + (NTP_SCALE_SHIFT - SHIFT_USEC), + NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; + + /* if enabled, the system clock frequency is updated */ + if ((time_status & STA_PPSFREQ) != 0 && + (time_status & STA_FREQHOLD) == 0) { + time_freq = pps_freq; + ntp_update_frequency(); + } + + return delta; +} + +/* correct REALTIME clock phase error against PPS signal */ +static void hardpps_update_phase(long error) +{ + long correction = -error; + long jitter; + + /* add the sample to the median filter */ + pps_phase_filter_add(correction); + correction = pps_phase_filter_get(&jitter); + + /* Nominal jitter is due to PPS signal noise. If it exceeds the + * threshold, the sample is discarded; otherwise, if so enabled, + * the time offset is updated. + */ + if (jitter > (pps_jitter << PPS_POPCORN)) { + pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); + time_status |= STA_PPSJITTER; + pps_jitcnt++; + } else if (time_status & STA_PPSTIME) { + /* correct the time using the phase offset */ + time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, + NTP_INTERVAL_FREQ); + /* cancel running adjtime() */ + time_adjust = 0; + } + /* update jitter */ + pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; +} + +/* + * hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS signal arrival in order to + * discipline the CPU clock oscillator to the PPS signal. It takes two + * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former + * is used to correct clock phase error and the latter is used to + * correct the frequency. + * + * This code is based on David Mills's reference nanokernel + * implementation. It was mostly rewritten but keeps the same idea. + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ + struct pps_normtime pts_norm, freq_norm; + unsigned long flags; + + pts_norm = pps_normalize_ts(*phase_ts); + + write_seqlock_irqsave(&xtime_lock, flags); + + /* clear the error bits, they will be set again if needed */ + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + + /* indicate signal presence */ + time_status |= STA_PPSSIGNAL; + pps_valid = PPS_VALID; + + /* when called for the first time, + * just start the frequency interval */ + if (unlikely(pps_fbase.tv_sec == 0)) { + pps_fbase = *raw_ts; + write_sequnlock_irqrestore(&xtime_lock, flags); + return; + } + + /* ok, now we have a base for frequency calculation */ + freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); + + /* check that the signal is in the range + * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ + if ((freq_norm.sec == 0) || + (freq_norm.nsec > MAXFREQ * freq_norm.sec) || + (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { + time_status |= STA_PPSJITTER; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + write_sequnlock_irqrestore(&xtime_lock, flags); + pr_err("hardpps: PPSJITTER: bad pulse\n"); + return; + } + + /* signal is ok */ + + /* check if the current frequency interval is finished */ + if (freq_norm.sec >= (1 << pps_shift)) { + pps_calcnt++; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + hardpps_update_freq(freq_norm); + } + + hardpps_update_phase(pts_norm.nsec); + + write_sequnlock_irqrestore(&xtime_lock, flags); +} +EXPORT_SYMBOL(hardpps); + +#endif /* CONFIG_NTP_PPS */ + static int __init ntp_tick_adj_setup(char *str) { ntp_tick_adj = simple_strtol(str, NULL, 0); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b6b898d..051bc80 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -49,7 +49,7 @@ struct tick_device *tick_get_device(int cpu) */ int tick_is_oneshot_available(void) { - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); } diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index aada0e5..5cbc101 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -95,7 +95,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, */ int tick_program_event(ktime_t expires, int force) { - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); return tick_dev_program_event(dev, expires, force); } @@ -167,7 +167,7 @@ int tick_oneshot_mode_active(void) int ret; local_irq_save(flags); - ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; + ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; local_irq_restore(flags); return ret; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5bb86da..d27c756 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -49,7 +49,7 @@ struct timekeeper { u32 mult; }; -struct timekeeper timekeeper; +static struct timekeeper timekeeper; /** * timekeeper_setup_internals - Set up internals to use clocksource clock. @@ -164,7 +164,7 @@ static struct timespec total_sleep_time; /* * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ -struct timespec raw_time; +static struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -288,6 +288,49 @@ void ktime_get_ts(struct timespec *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts); +#ifdef CONFIG_NTP_PPS + +/** + * getnstime_raw_and_real - get day and raw monotonic time in timespec format + * @ts_raw: pointer to the timespec to be set to raw monotonic time + * @ts_real: pointer to the timespec to be set to the time of day + * + * This function reads both the time of day and raw monotonic time at the + * same time atomically and stores the resulting timestamps in timespec + * format. + */ +void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) +{ + unsigned long seq; + s64 nsecs_raw, nsecs_real; + + WARN_ON_ONCE(timekeeping_suspended); + + do { + u32 arch_offset; + + seq = read_seqbegin(&xtime_lock); + + *ts_raw = raw_time; + *ts_real = xtime; + + nsecs_raw = timekeeping_get_ns_raw(); + nsecs_real = timekeeping_get_ns(); + + /* If arch requires, add in gettimeoffset() */ + arch_offset = arch_gettimeoffset(); + nsecs_raw += arch_offset; + nsecs_real += arch_offset; + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts_raw, nsecs_raw); + timespec_add_ns(ts_real, nsecs_real); +} +EXPORT_SYMBOL(getnstime_raw_and_real); + +#endif /* CONFIG_NTP_PPS */ + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 53f3381..761c510 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o -obj-$(CONFIG_EVENT_TRACING) += power-traces.o +obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7b8ec02..153562d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -758,53 +758,58 @@ static void blk_add_trace_rq_complete(void *ignore, * @q: queue the io is for * @bio: the source bio * @what: the action + * @error: error, if any * * Description: * Records an action against a bio. Will log the bio offset + size. * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what) + u32 what, int error) { struct blk_trace *bt = q->blk_trace; if (likely(!bt)) return; + if (!error && !bio_flagged(bio, BIO_UPTODATE)) + error = EIO; + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, - !bio_flagged(bio, BIO_UPTODATE), 0, NULL); + error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, - struct request_queue *q, struct bio *bio) + struct request_queue *q, struct bio *bio, + int error) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); } static void blk_add_trace_bio_backmerge(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); } static void blk_add_trace_bio_frontmerge(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); } static void blk_add_trace_bio_queue(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE); + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, @@ -812,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ); + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { struct blk_trace *bt = q->blk_trace; @@ -827,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { struct blk_trace *bt = q->blk_trace; @@ -887,7 +892,7 @@ static void blk_add_trace_split(void *ignore, } /** - * blk_add_trace_remap - Add a trace for a remap operation + * blk_add_trace_bio_remap - Add a trace for a bio-remap operation * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @bio: the source bio @@ -899,9 +904,9 @@ static void blk_add_trace_split(void *ignore, * it spans a stripe (or similar). Add a trace for that action. * **/ -static void blk_add_trace_remap(void *ignore, - struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from) +static void blk_add_trace_bio_remap(void *ignore, + struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -1016,7 +1021,7 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); - ret = register_trace_block_remap(blk_add_trace_remap, NULL); + ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); WARN_ON(ret); ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); WARN_ON(ret); @@ -1025,7 +1030,7 @@ static void blk_register_tracepoints(void) static void blk_unregister_tracepoints(void) { unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); - unregister_trace_block_remap(blk_add_trace_remap, NULL); + unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8cf959..dc53ecb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) __this_cpu_inc(user_stack_count); - - event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) - return; + goto out_drop_count; entry = ring_buffer_event_data(event); entry->tgid = current->tgid; @@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + out_drop_count: __this_cpu_dec(user_stack_count); - out: preempt_enable(); } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e3dfeca..6cf2237 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -53,7 +53,7 @@ */ /* - * Function trace entry - function address and parent function addres: + * Function trace entry - function address and parent function address: */ FTRACE_ENTRY(function, ftrace_entry, diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 562c56e..659732e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) static int trace_wakeup_test_thread(void *data) { /* Make this a RT thread, doesn't need to be too high */ - static struct sched_param param = { .sched_priority = 5 }; + static const struct sched_param param = { .sched_priority = 5 }; struct completion *x = data; sched_setscheduler(current, SCHED_FIFO, ¶m); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index bac752f..b706529 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event, static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); -/* All syscall exit events have the same fields */ -static LIST_HEAD(syscall_exit_fields); - static struct list_head * syscall_get_enter_fields(struct ftrace_event_call *call) { @@ -34,34 +31,28 @@ syscall_get_enter_fields(struct ftrace_event_call *call) return &entry->enter_fields; } -static struct list_head * -syscall_get_exit_fields(struct ftrace_event_call *call) -{ - return &syscall_exit_fields; -} - struct trace_event_functions enter_syscall_print_funcs = { - .trace = print_syscall_enter, + .trace = print_syscall_enter, }; struct trace_event_functions exit_syscall_print_funcs = { - .trace = print_syscall_exit, + .trace = print_syscall_exit, }; struct ftrace_event_class event_class_syscall_enter = { - .system = "syscalls", - .reg = syscall_enter_register, - .define_fields = syscall_enter_define_fields, - .get_fields = syscall_get_enter_fields, - .raw_init = init_syscall_trace, + .system = "syscalls", + .reg = syscall_enter_register, + .define_fields = syscall_enter_define_fields, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, }; struct ftrace_event_class event_class_syscall_exit = { - .system = "syscalls", - .reg = syscall_exit_register, - .define_fields = syscall_exit_define_fields, - .get_fields = syscall_get_exit_fields, - .raw_init = init_syscall_trace, + .system = "syscalls", + .reg = syscall_exit_register, + .define_fields = syscall_exit_define_fields, + .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), + .raw_init = init_syscall_trace, }; extern unsigned long __start_syscalls_metadata[]; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 2591583..9da289c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -12,6 +12,8 @@ #include <linux/highuid.h> #include <linux/cred.h> +static struct kmem_cache *user_ns_cachep __read_mostly; + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -26,7 +28,7 @@ int create_user_ns(struct cred *new) struct user_struct *root_user; int n; - ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); + ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); if (!ns) return -ENOMEM; @@ -38,7 +40,7 @@ int create_user_ns(struct cred *new) /* Alloc new root user. */ root_user = alloc_uid(ns, 0); if (!root_user) { - kfree(ns); + kmem_cache_free(user_ns_cachep, ns); return -ENOMEM; } @@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work) struct user_namespace *ns = container_of(work, struct user_namespace, destroyer); free_uid(ns->creator); - kfree(ns); + kmem_cache_free(user_ns_cachep, ns); } void free_user_ns(struct kref *kref) @@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t /* No useful relationship so no mapping */ return overflowgid; } + +static __init int user_namespaces_init(void) +{ + user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); + return 0; +} +module_init(user_namespaces_init); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6e7b575..d7ebdf4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -118,12 +118,12 @@ static void __touch_watchdog(void) { int this_cpu = smp_processor_id(); - __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); + __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu)); } void touch_softlockup_watchdog(void) { - __raw_get_cpu_var(watchdog_touch_ts) = 0; + __this_cpu_write(watchdog_touch_ts, 0); } EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -167,12 +167,12 @@ void touch_softlockup_watchdog_sync(void) /* watchdog detector functions */ static int is_hardlockup(void) { - unsigned long hrint = __get_cpu_var(hrtimer_interrupts); + unsigned long hrint = __this_cpu_read(hrtimer_interrupts); - if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) + if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) return 1; - __get_cpu_var(hrtimer_interrupts_saved) = hrint; + __this_cpu_write(hrtimer_interrupts_saved, hrint); return 0; } #endif @@ -205,8 +205,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi, /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; - if (__get_cpu_var(watchdog_nmi_touch) == true) { - __get_cpu_var(watchdog_nmi_touch) = false; + if (__this_cpu_read(watchdog_nmi_touch) == true) { + __this_cpu_write(watchdog_nmi_touch, false); return; } @@ -220,7 +220,7 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi, int this_cpu = smp_processor_id(); /* only print hardlockups once */ - if (__get_cpu_var(hard_watchdog_warn) == true) + if (__this_cpu_read(hard_watchdog_warn) == true) return; if (hardlockup_panic) @@ -228,16 +228,16 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi, else WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); - __get_cpu_var(hard_watchdog_warn) = true; + __this_cpu_write(hard_watchdog_warn, true); return; } - __get_cpu_var(hard_watchdog_warn) = false; + __this_cpu_write(hard_watchdog_warn, false); return; } static void watchdog_interrupt_count(void) { - __get_cpu_var(hrtimer_interrupts)++; + __this_cpu_inc(hrtimer_interrupts); } #else static inline void watchdog_interrupt_count(void) { return; } @@ -246,7 +246,7 @@ static inline void watchdog_interrupt_count(void) { return; } /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { - unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); + unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); struct pt_regs *regs = get_irq_regs(); int duration; @@ -254,18 +254,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) watchdog_interrupt_count(); /* kick the softlockup detector */ - wake_up_process(__get_cpu_var(softlockup_watchdog)); + wake_up_process(__this_cpu_read(softlockup_watchdog)); /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); if (touch_ts == 0) { - if (unlikely(__get_cpu_var(softlockup_touch_sync))) { + if (unlikely(__this_cpu_read(softlockup_touch_sync))) { /* * If the time stamp was touched atomically * make sure the scheduler tick is up to date. */ - __get_cpu_var(softlockup_touch_sync) = false; + __this_cpu_write(softlockup_touch_sync, false); sched_clock_tick(); } __touch_watchdog(); @@ -281,7 +281,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) duration = is_softlockup(touch_ts); if (unlikely(duration)) { /* only warn once */ - if (__get_cpu_var(soft_watchdog_warn) == true) + if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", @@ -296,9 +296,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (softlockup_panic) panic("softlockup: hung tasks"); - __get_cpu_var(soft_watchdog_warn) = true; + __this_cpu_write(soft_watchdog_warn, true); } else - __get_cpu_var(soft_watchdog_warn) = false; + __this_cpu_write(soft_watchdog_warn, false); return HRTIMER_RESTART; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e785b0f..8ee6ec8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -932,6 +932,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq, wake_up_worker(gcwq); } +/* + * Test whether @work is being queued from another work executing on the + * same workqueue. This is rather expensive and should only be used from + * cold paths. + */ +static bool is_chained_work(struct workqueue_struct *wq) +{ + unsigned long flags; + unsigned int cpu; + + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct worker *worker; + struct hlist_node *pos; + int i; + + spin_lock_irqsave(&gcwq->lock, flags); + for_each_busy_worker(worker, i, pos, gcwq) { + if (worker->task != current) + continue; + spin_unlock_irqrestore(&gcwq->lock, flags); + /* + * I'm @worker, no locking necessary. See if @work + * is headed to the same workqueue. + */ + return worker->current_cwq->wq == wq; + } + spin_unlock_irqrestore(&gcwq->lock, flags); + } + return false; +} + static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { @@ -943,7 +975,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, debug_work_activate(work); - if (WARN_ON_ONCE(wq->flags & WQ_DYING)) + /* if dying, only works from the same workqueue are allowed */ + if (unlikely(wq->flags & WQ_DYING) && + WARN_ON_ONCE(!is_chained_work(wq))) return; /* determine gcwq to use */ @@ -2936,11 +2970,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); */ void destroy_workqueue(struct workqueue_struct *wq) { + unsigned int flush_cnt = 0; unsigned int cpu; + /* + * Mark @wq dying and drain all pending works. Once WQ_DYING is + * set, only chain queueing is allowed. IOW, only currently + * pending or running work items on @wq can queue further work + * items on it. @wq is flushed repeatedly until it becomes empty. + * The number of flushing is detemined by the depth of chaining and + * should be relatively short. Whine if it takes too long. + */ wq->flags |= WQ_DYING; +reflush: flush_workqueue(wq); + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq->nr_active && list_empty(&cwq->delayed_works)) + continue; + + if (++flush_cnt == 10 || + (flush_cnt % 100 == 0 && flush_cnt <= 1000)) + printk(KERN_WARNING "workqueue %s: flush on " + "destruction isn't complete after %u tries\n", + wq->name, flush_cnt); + goto reflush; + } + /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. |