diff options
Diffstat (limited to 'kernel')
96 files changed, 1247 insertions, 911 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 353d3fe..85cbfb3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 37b2bea..e99dda0 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -607,7 +607,7 @@ void audit_trim_trees(void) spin_lock(&hash_lock); list_for_each_entry(node, &tree->chunks, list) { struct audit_chunk *chunk = find_chunk(node); - /* this could be NULL if the watch is dieing else where... */ + /* this could be NULL if the watch is dying else where... */ struct inode *inode = chunk->mark.i.inode; node->index |= 1U<<31; if (iterate_mounts(compare_root, inode, root_mnt)) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f49a031..b33513a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1011,7 +1011,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, /* * to_send and len_sent accounting are very loose estimates. We aren't * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundry) + * within about 500 bytes (next page boundary) * * why snprintf? an int is up to 12 digits long. 
if we just assumed when * logging that a[%d]= was going to be 16 characters long we would be wasting diff --git a/kernel/bounds.c b/kernel/bounds.c index 98a51f2..0c9b862 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,11 +9,13 @@ #include <linux/page-flags.h> #include <linux/mmzone.h> #include <linux/kbuild.h> +#include <linux/page_cgroup.h> void foo(void) { /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); + DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); /* End of constants */ } diff --git a/kernel/capability.c b/kernel/capability.c index 9e9385f..bf0c734 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -14,6 +14,7 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/pid_namespace.h> +#include <linux/user_namespace.h> #include <asm/uaccess.h> /* @@ -290,6 +291,60 @@ error: } /** + * has_capability - Does a task have a capability in init_user_ns + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to the initial user namespace, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability(struct task_struct *t, int cap) +{ + int ret = security_real_capable(t, &init_user_ns, cap); + + return (ret == 0); +} + +/** + * has_ns_capability - Does a task have a capability in a specific user ns + * @t: The task in question + * @ns: target user namespace + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to the specified user namespace, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. 
+ */ +bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap) +{ + int ret = security_real_capable(t, ns, cap); + + return (ret == 0); +} + +/** + * has_capability_noaudit - Does a task have a capability (unaudited) + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to init_user_ns, false if not. Don't write an + * audit message for the check. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability_noaudit(struct task_struct *t, int cap) +{ + int ret = security_real_capable_noaudit(t, &init_user_ns, cap); + + return (ret == 0); +} + +/** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * @@ -299,17 +354,48 @@ error: * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ -int capable(int cap) +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); + +/** + * ns_capable - Determine if the current task has a superior capability in effect + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. 
+ */ +bool ns_capable(struct user_namespace *ns, int cap) { if (unlikely(!cap_valid(cap))) { printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); BUG(); } - if (security_capable(current_cred(), cap) == 0) { + if (security_capable(ns, current_cred(), cap) == 0) { current->flags |= PF_SUPERPRIV; - return 1; + return true; } - return 0; + return false; } -EXPORT_SYMBOL(capable); +EXPORT_SYMBOL(ns_capable); + +/** + * task_ns_capable - Determine whether current task has a superior + * capability targeted at a specific task's user namespace. + * @t: The task whose user namespace is targeted. + * @cap: The capability in question. + * + * Return true if it does, false otherwise. + */ +bool task_ns_capable(struct task_struct *t, int cap) +{ + return ns_capable(task_cred_xxx(t, user)->user_ns, cap); +} +EXPORT_SYMBOL(task_ns_capable); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 95362d1..25c7eb5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -157,7 +157,7 @@ struct css_id { }; /* - * cgroup_event represents events which userspace want to recieve. + * cgroup_event represents events which userspace want to receive. 
*/ struct cgroup_event { /* @@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) /* Update the css_set linked lists if we're using them */ write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - list_del(&tsk->cg_list); - list_add(&tsk->cg_list, &newcg->tasks); - } + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); write_unlock(&css_set_lock); for_each_subsys(root, ss) { @@ -3655,12 +3653,12 @@ again: spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) - list_del(&cgrp->release_list); + list_del_init(&cgrp->release_list); spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ - list_del(&cgrp->sibling); + list_del_init(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); d = dget(cgrp->dentry); @@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) subsys[ss->subsys_id] = NULL; /* remove subsystem from rootnode's list of subsystems */ - list_del(&ss->sibling); + list_del_init(&ss->sibling); /* * disentangle the css from all css_sets attached to the dummytop. as @@ -4241,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) if (!list_empty(&tsk->cg_list)) { write_lock(&css_set_lock); if (!list_empty(&tsk->cg_list)) - list_del(&tsk->cg_list); + list_del_init(&tsk->cg_list); write_unlock(&css_set_lock); } diff --git a/kernel/cpu.c b/kernel/cpu.c index 156cc55..12b7458 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -126,7 +126,7 @@ static void cpu_hotplug_done(void) #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} -#endif /* #esle #if CONFIG_HOTPLUG_CPU */ +#endif /* #else #if CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? 
*/ int __ref register_cpu_notifier(struct notifier_block *nb) @@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v) { BUG_ON(cpu_notify(val, v)); } - EXPORT_SYMBOL(register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) @@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param) return err; cpu_notify(CPU_DYING | param->mod, param->hcpu); - return 0; } @@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { nr_calls--; @@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - printk("%s: attempt to bring up CPU %u failed\n", + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", __func__, cpu); goto out_notify; } @@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - printk("Enabling non-boot CPUs ...\n"); + printk(KERN_INFO "Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { - printk("CPU%d is up\n", cpu); + printk(KERN_INFO "CPU%d is up\n", cpu); continue; } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); @@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) */ /* cpu_bit_bitmap[0] is empty - so we can back into it */ -#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) +#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x)) #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e92e981..33eee16 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1015,17 +1015,12 @@ static void 
cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); - - if (!newmems) - return; + static nodemask_t newmems; /* protected by cgroup_mutex */ cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, newmems); - - cpuset_change_task_nodemask(p, newmems); + guarantee_online_mems(cs, &newmems); - NODEMASK_FREE(newmems); + cpuset_change_task_nodemask(p, &newmems); mm = get_task_mm(p); if (!mm) @@ -1438,44 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); - NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); - - if (from == NULL || to == NULL) - goto alloc_fail; + static nodemask_t to; /* protected by cgroup_mutex */ if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); } else { guarantee_online_cpus(cs, cpus_attach); } - guarantee_online_mems(cs, to); + guarantee_online_mems(cs, &to); /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, to, cs); + cpuset_attach_task(tsk, &to, cs); if (threadgroup) { struct task_struct *c; rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, to, cs); + cpuset_attach_task(c, &to, cs); } rcu_read_unlock(); } /* change mm; only needs to be done once even if threadgroup */ - *from = oldcs->mems_allowed; - *to = cs->mems_allowed; + to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, to); + mpol_rebind_mm(mm, &to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, from, to); + cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to); mmput(mm); } - -alloc_fail: - NODEMASK_FREE(from); - NODEMASK_FREE(to); } /* The various types of files and directories in a cpuset file system */ @@ -1610,34 +1596,26 @@ out: * across a page fault. 
*/ -static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) +static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { - int ret; + size_t count; mutex_lock(&callback_mutex); - ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); + count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); mutex_unlock(&callback_mutex); - return ret; + return count; } -static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) +static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) { - NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); - int retval; - - if (mask == NULL) - return -ENOMEM; + size_t count; mutex_lock(&callback_mutex); - *mask = cs->mems_allowed; + count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); mutex_unlock(&callback_mutex); - retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); - - NODEMASK_FREE(mask); - - return retval; + return count; } static ssize_t cpuset_common_file_read(struct cgroup *cont, @@ -1862,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, cs = cgroup_cs(cgroup); parent_cs = cgroup_cs(parent); + mutex_lock(&callback_mutex); cs->mems_allowed = parent_cs->mems_allowed; cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); + mutex_unlock(&callback_mutex); return; } @@ -2066,10 +2046,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ struct cgroup *cont; - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); - - if (oldmems == NULL) - return; + static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); @@ -2086,7 +2063,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; - *oldmems = cp->mems_allowed; + oldmems = cp->mems_allowed; /* Remove offline cpus and mems from this cpuset. 
*/ mutex_lock(&callback_mutex); @@ -2102,10 +2079,9 @@ static void scan_for_empty_cpusets(struct cpuset *root) remove_tasks_in_empty_cpuset(cp); else { update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, oldmems, NULL); + update_tasks_nodemask(cp, &oldmems, NULL); } } - NODEMASK_FREE(oldmems); } /* @@ -2147,19 +2123,16 @@ void cpuset_update_active_cpus(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); - - if (oldmems == NULL) - return NOTIFY_DONE; + static nodemask_t oldmems; /* protected by cgroup_mutex */ cgroup_lock(); switch (action) { case MEM_ONLINE: - *oldmems = top_cpuset.mems_allowed; + oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, oldmems, NULL); + update_tasks_nodemask(&top_cpuset, &oldmems, NULL); break; case MEM_OFFLINE: /* @@ -2173,7 +2146,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self, } cgroup_unlock(); - NODEMASK_FREE(oldmems); return NOTIFY_OK; } #endif diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c new file mode 100644 index 0000000..5f85690 --- /dev/null +++ b/kernel/crash_dump.c @@ -0,0 +1,34 @@ +#include <linux/kernel.h> +#include <linux/crash_dump.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/module.h> + +/* + * If we have booted due to a crash, max_pfn will be a very low value. We need + * to know the amount of memory that the previous kernel used. + */ +unsigned long saved_max_pfn; + +/* + * stores the physical address of elf header of crash image + * + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence put + * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. 
+ */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + +/* + * elfcorehdr= specifies the location of elf core header stored by the crashed + * kernel. This option will be passed by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) +{ + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; +} +early_param("elfcorehdr", setup_elfcorehdr); diff --git a/kernel/cred.c b/kernel/cred.c index 2343c132..5557b55 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode) } EXPORT_SYMBOL(set_create_files_as); +struct user_namespace *current_user_ns(void) +{ + return _current_user_ns(); +} +EXPORT_SYMBOL(current_user_ns); + #ifdef CONFIG_DEBUG_CREDENTIALS bool creds_are_invalid(const struct cred *cred) diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index cefd4a1..bad6786 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -538,7 +538,7 @@ return_normal: /* * For single stepping, try to only enter on the processor - * that was single stepping. To gaurd against a deadlock, the + * that was single stepping. To guard against a deadlock, the * kernel will only try for the value of sstep_tries before * giving up and continuing on. */ diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 481a7bd..a11db95 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) put_packet(remcom_out_buffer); return 0; } + +/** + * gdbstub_exit - Send an exit message to GDB + * @status: The exit code to report. 
+ */ +void gdbstub_exit(int status) +{ + unsigned char checksum, ch, buffer[3]; + int loop; + + buffer[0] = 'W'; + buffer[1] = hex_asc_hi(status); + buffer[2] = hex_asc_lo(status); + + dbg_io_ops->write_char('$'); + checksum = 0; + + for (loop = 0; loop < 3; loop++) { + ch = buffer[loop]; + checksum += ch; + dbg_io_ops->write_char(ch); + } + + dbg_io_ops->write_char('#'); + dbg_io_ops->write_char(hex_asc_hi(checksum)); + dbg_io_ops->write_char(hex_asc_lo(checksum)); + + /* make sure the output is flushed, lest the bootloader clobber it */ + dbg_io_ops->flush(); +} diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index bd3e8e2..be14779 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -78,7 +78,7 @@ static unsigned int kdb_continue_catastrophic; static kdbtab_t *kdb_commands; #define KDB_BASE_CMD_MAX 50 static int kdb_max_commands = KDB_BASE_CMD_MAX; -static kdbtab_t kdb_base_commands[50]; +static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX]; #define for_each_kdbcmd(cmd, num) \ for ((cmd) = kdb_base_commands, (num) = 0; \ num < kdb_max_commands; \ @@ -441,9 +441,9 @@ static int kdb_check_regs(void) * symbol name, and offset to the caller. * * The argument may consist of a numeric value (decimal or - * hexidecimal), a symbol name, a register name (preceeded by the + * hexidecimal), a symbol name, a register name (preceded by the * percent sign), an environment variable with a numeric value - * (preceeded by a dollar sign) or a simple arithmetic expression + * (preceded by a dollar sign) or a simple arithmetic expression * consisting of a symbol name, +/-, and a numeric constant value * (offset). * Parameters: @@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value) * error The hardware-defined error code * reason2 kdb's current reason code. * Initially error but can change - * acording to kdb state. + * according to kdb state. * db_result Result code from break or debug point. 
* regs The exception frame at time of fault/breakpoint. * should always be valid. @@ -2892,7 +2892,7 @@ static void __init kdb_inittab(void) "Send a signal to a process", 0, KDB_REPEAT_NONE); kdb_register_repeat("summary", kdb_summary, "", "Summarize the system", 4, KDB_REPEAT_NONE); - kdb_register_repeat("per_cpu", kdb_per_cpu, "", + kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", "Display per_cpu variables", 3, KDB_REPEAT_NONE); kdb_register_repeat("grephelp", kdb_grep_help, "", "Display help on | grep", 0, KDB_REPEAT_NONE); diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 6b2485d..5532dd3 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) * Mask for process state. * Notes: * The mask folds data from several sources into a single long value, so - * be carefull not to overlap the bits. TASK_* bits are in the LSB, + * be careful not to overlap the bits. TASK_* bits are in the LSB, * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there * is no overlap between TASK_* and EXIT_* but that may not always be * true, so EXIT_* bits are shifted left 16 bits before being stored in diff --git a/kernel/exit.c b/kernel/exit.c index f9a45eb..f5d2f63 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* Let father know we died * * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitary processes. + * that to send signals to arbitrary processes. * That stops right now. 
* * If the parent exec id doesn't match the exec id we saved @@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code) profile_task_exit(tsk); WARN_ON(atomic_read(&tsk->fs_excl)); + WARN_ON(blk_needs_flush_plug(tsk)); if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); diff --git a/kernel/fork.c b/kernel/fork.c index 05b92c4..e7548de 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include <linux/tracehook.h> #include <linux/futex.h> #include <linux/compat.h> +#include <linux/kthread.h> #include <linux/task_io_accounting_ops.h> #include <linux/rcupdate.h> #include <linux/ptrace.h> @@ -109,20 +110,25 @@ int nr_processes(void) } #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) -# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) +# define alloc_task_struct_node(node) \ + kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) +# define free_task_struct(tsk) \ + kmem_cache_free(task_struct_cachep, (tsk)) static struct kmem_cache *task_struct_cachep; #endif #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR -static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) +static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, + int node) { #ifdef CONFIG_DEBUG_STACK_USAGE gfp_t mask = GFP_KERNEL | __GFP_ZERO; #else gfp_t mask = GFP_KERNEL; #endif - return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); + + return page ? 
page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) @@ -249,16 +255,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - + int node = tsk_fork_get_node(orig); int err; prepare_to_copy(orig); - tsk = alloc_task_struct(); + tsk = alloc_task_struct_node(node); if (!tsk) return NULL; - ti = alloc_thread_info(tsk); + ti = alloc_thread_info_node(tsk, node); if (!ti) { free_task_struct(tsk); return NULL; @@ -1181,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; - - if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); - if (retval < 0) - goto bad_fork_free_pid; - } } p->pid = pid_nr(pid); @@ -1205,6 +1205,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; +#ifdef CONFIG_BLOCK + p->plug = NULL; +#endif #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT @@ -1290,7 +1293,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { - if (clone_flags & CLONE_NEWPID) + if (is_child_reaper(pid)) p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; @@ -1513,38 +1516,24 @@ void __init proc_caches_init(void) } /* - * Check constraints on flags passed to the unshare system call and - * force unsharing of additional process context as appropriate. + * Check constraints on flags passed to the unshare system call. 
*/ -static void check_unshare_flags(unsigned long *flags_ptr) +static int check_unshare_flags(unsigned long unshare_flags) { + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + return -EINVAL; /* - * If unsharing a thread from a thread group, must also - * unshare vm. - */ - if (*flags_ptr & CLONE_THREAD) - *flags_ptr |= CLONE_VM; - - /* - * If unsharing vm, must also unshare signal handlers. - */ - if (*flags_ptr & CLONE_VM) - *flags_ptr |= CLONE_SIGHAND; - - /* - * If unsharing namespace, must also unshare filesystem information. + * Not implemented, but pretend it works if there is nothing to + * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND + * needs to unshare vm. */ - if (*flags_ptr & CLONE_NEWNS) - *flags_ptr |= CLONE_FS; -} - -/* - * Unsharing of tasks created with CLONE_THREAD is not supported yet - */ -static int unshare_thread(unsigned long unshare_flags) -{ - if (unshare_flags & CLONE_THREAD) - return -EINVAL; + if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { + /* FIXME: get_task_mm() increments ->mm_users */ + if (atomic_read(&current->mm->mm_users) > 1) + return -EINVAL; + } return 0; } @@ -1571,34 +1560,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) } /* - * Unsharing of sighand is not supported yet - */ -static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) -{ - struct sighand_struct *sigh = current->sighand; - - if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) - return -EINVAL; - else - return 0; -} - -/* - * Unshare vm if it is being shared - */ -static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) -{ - struct mm_struct *mm = current->mm; - - if ((unshare_flags & CLONE_VM) && - (mm && atomic_read(&mm->mm_users) > 1)) { - return -EINVAL; - } - - return 0; -} - -/* * Unshare file descriptor table if it is 
being shared */ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) @@ -1626,45 +1587,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp */ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { - int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct sighand_struct *new_sigh = NULL; - struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; + int err; - check_unshare_flags(&unshare_flags); - - /* Return -EINVAL for all unsupported flags */ - err = -EINVAL; - if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + err = check_unshare_flags(unshare_flags); + if (err) goto bad_unshare_out; /* + * If unsharing namespace, must also unshare filesystem information. + */ + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old * namespace are unreachable. */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; - if ((err = unshare_thread(unshare_flags))) - goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) - goto bad_unshare_cleanup_thread; - if ((err = unshare_sighand(unshare_flags, &new_sigh))) - goto bad_unshare_cleanup_fs; - if ((err = unshare_vm(unshare_flags, &new_mm))) - goto bad_unshare_cleanup_sigh; + goto bad_unshare_out; if ((err = unshare_fd(unshare_flags, &new_fd))) - goto bad_unshare_cleanup_vm; + goto bad_unshare_cleanup_fs; if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) goto bad_unshare_cleanup_fd; - if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). 
@@ -1690,19 +1643,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) spin_unlock(&fs->lock); } - if (new_mm) { - mm = current->mm; - active_mm = current->active_mm; - current->mm = new_mm; - current->active_mm = new_mm; - if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&mm->oom_disable_count); - atomic_inc(&new_mm->oom_disable_count); - } - activate_mm(active_mm, new_mm); - new_mm = mm; - } - if (new_fd) { fd = current->files; current->files = new_fd; @@ -1719,20 +1659,10 @@ bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); -bad_unshare_cleanup_vm: - if (new_mm) - mmput(new_mm); - -bad_unshare_cleanup_sigh: - if (new_sigh) - if (atomic_dec_and_test(&new_sigh->count)) - kmem_cache_free(sighand_cachep, new_sigh); - bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); -bad_unshare_cleanup_thread: bad_unshare_out: return err; } diff --git a/kernel/futex.c b/kernel/futex.c index bda4157..fe28dc2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -782,8 +782,8 @@ static void __unqueue_futex(struct futex_q *q) { struct futex_hash_bucket *hb; - if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr) - || plist_node_empty(&q->list))) + if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) + || WARN_ON(plist_node_empty(&q->list))) return; hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); @@ -1886,7 +1886,7 @@ retry: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = flags; + restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; ret = -ERESTART_RESTARTBLOCK; @@ -2418,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, goto err_unlock; ret = -EPERM; pcred = __task_cred(p); + /* If victim is in different user_ns, then uids are not + comparable, so we must have CAP_SYS_PTRACE */ + if (cred->user->user_ns != pcred->user->user_ns) { + if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) + goto err_unlock; + goto ok; + } + /* If 
victim is in same user_ns, then uids are comparable */ if (cred->euid != pcred->euid && cred->euid != pcred->uid && - !capable(CAP_SYS_PTRACE)) + !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) goto err_unlock; +ok: head = p->robust_list; rcu_read_unlock(); } diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index a7934ac..5f9e689 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, goto err_unlock; ret = -EPERM; pcred = __task_cred(p); + /* If victim is in different user_ns, then uids are not + comparable, so we must have CAP_SYS_PTRACE */ + if (cred->user->user_ns != pcred->user->user_ns) { + if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) + goto err_unlock; + goto ok; + } + /* If victim is in same user_ns, then uids are comparable */ if (cred->euid != pcred->euid && cred->euid != pcred->uid && - !capable(CAP_SYS_PTRACE)) + !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) goto err_unlock; +ok: head = p->compat_robust_list; rcu_read_unlock(); } diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 3f76100..e97ca59 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,3 +1,3 @@ -EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' +ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o diff --git a/kernel/groups.c b/kernel/groups.c index 253dc0f..1cc476d 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!capable(CAP_SETGID)) + if (!nsown_capable(CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 09bef82..c574f9a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -10,13 +10,6 @@ menu "IRQ subsystem" config 
GENERIC_HARDIRQS def_bool y -# Select this to disable the deprecated stuff -config GENERIC_HARDIRQS_NO_DEPRECATED - bool - -config GENERIC_HARDIRQS_NO_COMPAT - bool - # Options selectable by the architecture code # Make sparse irq Kconfig switch below available @@ -31,6 +24,10 @@ config GENERIC_IRQ_PROBE config GENERIC_IRQ_SHOW bool +# Print level/edge extra information +config GENERIC_IRQ_SHOW_LEVEL + bool + # Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool @@ -47,6 +44,10 @@ config HARDIRQS_SW_RESEND config IRQ_PREFLOW_FASTEOI bool +# Edge style eoi based handler (cell) +config IRQ_EDGE_EOI_HANDLER + bool + # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 394784c..342d8f4 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -70,10 +70,8 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc)) { - irq_compat_set_pending(desc); + if (irq_startup(desc)) desc->istate |= IRQS_PENDING; - } } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c9c0601..4af1e2b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -34,9 +34,14 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - irq_chip_set_defaults(chip); desc->irq_data.chip = chip; irq_put_desc_unlock(desc, flags); + /* + * For !CONFIG_SPARSE_IRQ make the irq show up in + * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is + * already marked, and this call is harmless. 
+ */ + irq_reserve_irq(irq); return 0; } EXPORT_SYMBOL(irq_set_chip); @@ -134,26 +139,22 @@ EXPORT_SYMBOL_GPL(irq_get_irq_data); static void irq_state_clr_disabled(struct irq_desc *desc) { - desc->istate &= ~IRQS_DISABLED; - irq_compat_clr_disabled(desc); + irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); } static void irq_state_set_disabled(struct irq_desc *desc) { - desc->istate |= IRQS_DISABLED; - irq_compat_set_disabled(desc); + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); } static void irq_state_clr_masked(struct irq_desc *desc) { - desc->istate &= ~IRQS_MASKED; - irq_compat_clr_masked(desc); + irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } static void irq_state_set_masked(struct irq_desc *desc) { - desc->istate |= IRQS_MASKED; - irq_compat_set_masked(desc); + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); } int irq_startup(struct irq_desc *desc) @@ -203,126 +204,6 @@ void irq_disable(struct irq_desc *desc) } } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -/* Temporary migration helpers */ -static void compat_irq_mask(struct irq_data *data) -{ - data->chip->mask(data->irq); -} - -static void compat_irq_unmask(struct irq_data *data) -{ - data->chip->unmask(data->irq); -} - -static void compat_irq_ack(struct irq_data *data) -{ - data->chip->ack(data->irq); -} - -static void compat_irq_mask_ack(struct irq_data *data) -{ - data->chip->mask_ack(data->irq); -} - -static void compat_irq_eoi(struct irq_data *data) -{ - data->chip->eoi(data->irq); -} - -static void compat_irq_enable(struct irq_data *data) -{ - data->chip->enable(data->irq); -} - -static void compat_irq_disable(struct irq_data *data) -{ - data->chip->disable(data->irq); -} - -static void compat_irq_shutdown(struct irq_data *data) -{ - data->chip->shutdown(data->irq); -} - -static unsigned int compat_irq_startup(struct irq_data *data) -{ - return data->chip->startup(data->irq); -} - -static int compat_irq_set_affinity(struct irq_data *data, - const struct cpumask *dest, bool force) -{ - return 
data->chip->set_affinity(data->irq, dest); -} - -static int compat_irq_set_type(struct irq_data *data, unsigned int type) -{ - return data->chip->set_type(data->irq, type); -} - -static int compat_irq_set_wake(struct irq_data *data, unsigned int on) -{ - return data->chip->set_wake(data->irq, on); -} - -static int compat_irq_retrigger(struct irq_data *data) -{ - return data->chip->retrigger(data->irq); -} - -static void compat_bus_lock(struct irq_data *data) -{ - data->chip->bus_lock(data->irq); -} - -static void compat_bus_sync_unlock(struct irq_data *data) -{ - data->chip->bus_sync_unlock(data->irq); -} -#endif - -/* - * Fixup enable/disable function pointers - */ -void irq_chip_set_defaults(struct irq_chip *chip) -{ -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - if (chip->enable) - chip->irq_enable = compat_irq_enable; - if (chip->disable) - chip->irq_disable = compat_irq_disable; - if (chip->shutdown) - chip->irq_shutdown = compat_irq_shutdown; - if (chip->startup) - chip->irq_startup = compat_irq_startup; - if (!chip->end) - chip->end = dummy_irq_chip.end; - if (chip->bus_lock) - chip->irq_bus_lock = compat_bus_lock; - if (chip->bus_sync_unlock) - chip->irq_bus_sync_unlock = compat_bus_sync_unlock; - if (chip->mask) - chip->irq_mask = compat_irq_mask; - if (chip->unmask) - chip->irq_unmask = compat_irq_unmask; - if (chip->ack) - chip->irq_ack = compat_irq_ack; - if (chip->mask_ack) - chip->irq_mask_ack = compat_irq_mask_ack; - if (chip->eoi) - chip->irq_eoi = compat_irq_eoi; - if (chip->set_affinity) - chip->irq_set_affinity = compat_irq_set_affinity; - if (chip->set_type) - chip->irq_set_type = compat_irq_set_type; - if (chip->set_wake) - chip->irq_set_wake = compat_irq_set_wake; - if (chip->retrigger) - chip->irq_retrigger = compat_irq_retrigger; -#endif -} - static inline void mask_ack_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask_ack) @@ -372,11 +253,10 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); 
action = desc->action; - if (unlikely(!action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; - irq_compat_set_progress(desc); - desc->istate |= IRQS_INPROGRESS; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); @@ -384,8 +264,7 @@ void handle_nested_irq(unsigned int irq) note_interrupt(irq, desc, action_ret); raw_spin_lock_irq(&desc->lock); - desc->istate &= ~IRQS_INPROGRESS; - irq_compat_clr_progress(desc); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); out_unlock: raw_spin_unlock_irq(&desc->lock); @@ -416,14 +295,14 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; handle_irq_event(desc); @@ -448,7 +327,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out_unlock; @@ -459,12 +338,12 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * keep it masked and get out of here */ - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; handle_irq_event(desc); - if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT))) + if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); @@ 
-496,7 +375,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out; @@ -507,8 +386,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * then mask it and get out of here: */ - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { - irq_compat_set_pending(desc); + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; @@ -537,7 +415,7 @@ out: * @desc: the interrupt description structure for this irq * * Interrupt occures on the falling and/or rising edge of a hardware - * signal. The occurence is latched into the irq controller hardware + * signal. The occurrence is latched into the irq controller hardware * and must be acked in order to be reenabled. After the ack another * interrupt can happen on the same source even before the first one * is handled by the associated event handler. If this happens it @@ -558,10 +436,9 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * we shouldn't process the IRQ. Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || - !desc->action))) { + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { if (!irq_check_poll(desc)) { - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; mask_ack_irq(desc); goto out_unlock; @@ -584,20 +461,65 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * Renable it, if it was not disabled in meantime. 
*/ if (unlikely(desc->istate & IRQS_PENDING)) { - if (!(desc->istate & IRQS_DISABLED) && - (desc->istate & IRQS_MASKED)) + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) unmask_irq(desc); } handle_irq_event(desc); } while ((desc->istate & IRQS_PENDING) && - !(desc->istate & IRQS_DISABLED)); + !irqd_irq_disabled(&desc->irq_data)); out_unlock: raw_spin_unlock(&desc->lock); } +#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER +/** + * handle_edge_eoi_irq - edge eoi type IRQ handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Similar as the above handle_edge_irq, but using eoi and w/o the + * mask/unmask logic. + */ +void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + + raw_spin_lock(&desc->lock); + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + /* + * If we're currently running this IRQ, or its disabled, + * we shouldn't process the IRQ. Mark it pending, handle + * the necessary masking and go out + */ + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { + if (!irq_check_poll(desc)) { + desc->istate |= IRQS_PENDING; + goto out_eoi; + } + } + kstat_incr_irqs_this_cpu(irq, desc); + + do { + if (unlikely(!desc->action)) + goto out_eoi; + + handle_irq_event(desc); + + } while ((desc->istate & IRQS_PENDING) && + !irqd_irq_disabled(&desc->irq_data)); + +out_eoi: + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); +} +#endif + /** * handle_percpu_irq - Per CPU local irq handler * @irq: the interrupt number @@ -642,8 +564,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); - irq_compat_set_disabled(desc); - desc->istate |= IRQS_DISABLED; + irq_state_set_disabled(desc); desc->depth = 1; } desc->handle_irq = handle; @@ -684,8 +605,70 @@ 
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irqd_set(&desc->irq_data, IRQD_PER_CPU); if (irq_settings_can_move_pcntxt(desc)) irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); + if (irq_settings_is_level(desc)) + irqd_set(&desc->irq_data, IRQD_LEVEL); irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); irq_put_desc_unlock(desc, flags); } + +/** + * irq_cpu_online - Invoke all irq_cpu_online functions. + * + * Iterate through all irqs and invoke the chip.irq_cpu_online() + * for each. + */ +void irq_cpu_online(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + chip = irq_data_get_irq_chip(&desc->irq_data); + if (chip && chip->irq_cpu_online && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !irqd_irq_disabled(&desc->irq_data))) + chip->irq_cpu_online(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} + +/** + * irq_cpu_offline - Invoke all irq_cpu_offline functions. + * + * Iterate through all irqs and invoke the chip.irq_cpu_offline() + * for each. 
+ */ +void irq_cpu_offline(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + chip = irq_data_get_irq_chip(&desc->irq_data); + if (chip && chip->irq_cpu_offline && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !irqd_irq_disabled(&desc->irq_data))) + chip->irq_cpu_offline(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h deleted file mode 100644 index 6bbaf66..0000000 --- a/kernel/irq/compat.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Compat layer for transition period - */ -#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -static inline void irq_compat_set_progress(struct irq_desc *desc) -{ - desc->status |= IRQ_INPROGRESS; -} - -static inline void irq_compat_clr_progress(struct irq_desc *desc) -{ - desc->status &= ~IRQ_INPROGRESS; -} -static inline void irq_compat_set_disabled(struct irq_desc *desc) -{ - desc->status |= IRQ_DISABLED; -} -static inline void irq_compat_clr_disabled(struct irq_desc *desc) -{ - desc->status &= ~IRQ_DISABLED; -} -static inline void irq_compat_set_pending(struct irq_desc *desc) -{ - desc->status |= IRQ_PENDING; -} - -static inline void irq_compat_clr_pending(struct irq_desc *desc) -{ - desc->status &= ~IRQ_PENDING; -} -static inline void irq_compat_set_masked(struct irq_desc *desc) -{ - desc->status |= IRQ_MASKED; -} - -static inline void irq_compat_clr_masked(struct irq_desc *desc) -{ - desc->status &= ~IRQ_MASKED; -} -static inline void irq_compat_set_move_pending(struct irq_desc *desc) -{ - desc->status |= IRQ_MOVE_PENDING; -} - -static inline void irq_compat_clr_move_pending(struct irq_desc *desc) -{ - desc->status &= ~IRQ_MOVE_PENDING; -} -static inline void irq_compat_set_affinity(struct irq_desc *desc) -{ - desc->status |= IRQ_AFFINITY_SET; -} - -static inline void 
irq_compat_clr_affinity(struct irq_desc *desc) -{ - desc->status &= ~IRQ_AFFINITY_SET; -} -#else -static inline void irq_compat_set_progress(struct irq_desc *desc) { } -static inline void irq_compat_clr_progress(struct irq_desc *desc) { } -static inline void irq_compat_set_disabled(struct irq_desc *desc) { } -static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } -static inline void irq_compat_set_pending(struct irq_desc *desc) { } -static inline void irq_compat_clr_pending(struct irq_desc *desc) { } -static inline void irq_compat_set_masked(struct irq_desc *desc) { } -static inline void irq_compat_clr_masked(struct irq_desc *desc) { } -static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } -static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } -static inline void irq_compat_set_affinity(struct irq_desc *desc) { } -static inline void irq_compat_clr_affinity(struct irq_desc *desc) { } -#endif - diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index d1a33b7..306cba3 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -4,8 +4,10 @@ #include <linux/kallsyms.h> -#define P(f) if (desc->status & f) printk("%14s set\n", #f) +#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) #define PS(f) if (desc->istate & f) printk("%14s set\n", #f) +/* FIXME */ +#define PD(f) do { } while (0) static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { @@ -28,13 +30,15 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_NOAUTOEN); PS(IRQS_AUTODETECT); - PS(IRQS_INPROGRESS); PS(IRQS_REPLAY); PS(IRQS_WAITING); - PS(IRQS_DISABLED); PS(IRQS_PENDING); - PS(IRQS_MASKED); + + PD(IRQS_INPROGRESS); + PD(IRQS_DISABLED); + PD(IRQS_MASKED); } #undef P #undef PS +#undef PD diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index 20dc547..b5fcd96 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -31,13 +31,6 @@ static unsigned int 
noop_ret(struct irq_data *data) return 0; } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -static void compat_noop(unsigned int irq) { } -#define END_INIT .end = compat_noop -#else -#define END_INIT -#endif - /* * Generic no controller implementation */ @@ -48,7 +41,6 @@ struct irq_chip no_irq_chip = { .irq_enable = noop, .irq_disable = noop, .irq_ack = ack_bad, - END_INIT }; /* @@ -64,5 +56,4 @@ struct irq_chip dummy_irq_chip = { .irq_ack = noop, .irq_mask = noop, .irq_unmask = noop, - END_INIT }; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 517561f..90cb55f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -175,28 +175,13 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) struct irqaction *action = desc->action; irqreturn_t ret; - irq_compat_clr_pending(desc); desc->istate &= ~IRQS_PENDING; - irq_compat_set_progress(desc); - desc->istate |= IRQS_INPROGRESS; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); ret = handle_irq_event_percpu(desc, action); raw_spin_lock(&desc->lock); - desc->istate &= ~IRQS_INPROGRESS; - irq_compat_clr_progress(desc); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); return ret; } - -/** - * handle_IRQ_event - irq action chain handler - * @irq: the interrupt number - * @action: the interrupt action chain for this irq - * - * Handles the action chain of an irq event - */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) -{ - return handle_irq_event_percpu(irq_to_desc(irq), action); -} diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 6c6ec9a..6546431 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,10 +15,6 @@ #define istate core_internal_state__do_not_mess_with_it -#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -# define status status_use_accessors -#endif - extern int noirqdebug; /* @@ -44,38 +40,28 @@ enum { * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt * detection * IRQS_POLL_INPROGRESS - 
polling in progress - * IRQS_INPROGRESS - Interrupt in progress * IRQS_ONESHOT - irq is not unmasked in primary handler * IRQS_REPLAY - irq is replayed * IRQS_WAITING - irq is waiting - * IRQS_DISABLED - irq is disabled * IRQS_PENDING - irq is pending and replayed later - * IRQS_MASKED - irq is masked * IRQS_SUSPENDED - irq is suspended */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, IRQS_POLL_INPROGRESS = 0x00000008, - IRQS_INPROGRESS = 0x00000010, IRQS_ONESHOT = 0x00000020, IRQS_REPLAY = 0x00000040, IRQS_WAITING = 0x00000080, - IRQS_DISABLED = 0x00000100, IRQS_PENDING = 0x00000200, - IRQS_MASKED = 0x00000400, IRQS_SUSPENDED = 0x00000800, }; -#include "compat.h" #include "debug.h" #include "settings.h" #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) -/* Set default functions for irq_chip structures: */ -extern void irq_chip_set_defaults(struct irq_chip *chip); - extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); @@ -162,13 +148,11 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) static inline void irqd_set_move_pending(struct irq_data *d) { d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; - irq_compat_set_move_pending(irq_data_to_desc(d)); } static inline void irqd_clr_move_pending(struct irq_data *d) { d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; - irq_compat_clr_move_pending(irq_data_to_desc(d)); } static inline void irqd_clear(struct irq_data *d, unsigned int mask) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index dbccc79..2c039c9 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -80,7 +80,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); - desc->istate = IRQS_DISABLED; 
+ irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; @@ -198,15 +198,6 @@ err: return -ENOMEM; } -struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) -{ - int res = irq_alloc_descs(irq, irq, 1, node); - - if (res == -EEXIST || res == irq) - return irq_to_desc(irq); - return NULL; -} - static int irq_expand_nr_irqs(unsigned int nr) { if (nr > IRQ_BITMAP_BITS) @@ -247,7 +238,6 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .istate = IRQS_DISABLED, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), @@ -283,11 +273,6 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) -{ - return irq_to_desc(irq); -} - static void free_desc(unsigned int irq) { dynamic_irq_cleanup(irq); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a2aa73..07c1611 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -41,7 +41,7 @@ early_param("threadirqs", setup_forced_irqthreads); void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned int state; + bool inprogress; if (!desc) return; @@ -53,16 +53,16 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->istate & IRQS_INPROGRESS) + while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); - state = desc->istate; + inprogress = irqd_irq_inprogress(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? 
*/ - } while (state & IRQS_INPROGRESS); + } while (inprogress); /* * We made sure that no hardirq handler is running. Now verify @@ -112,13 +112,13 @@ void irq_set_thread_affinity(struct irq_desc *desc) } #ifdef CONFIG_GENERIC_PENDING_IRQ -static inline bool irq_can_move_pcntxt(struct irq_desc *desc) +static inline bool irq_can_move_pcntxt(struct irq_data *data) { - return irq_settings_can_move_pcntxt(desc); + return irqd_can_move_in_process_context(data); } -static inline bool irq_move_pending(struct irq_desc *desc) +static inline bool irq_move_pending(struct irq_data *data) { - return irqd_is_setaffinity_pending(&desc->irq_data); + return irqd_is_setaffinity_pending(data); } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) @@ -131,43 +131,34 @@ irq_get_pending(struct cpumask *mask, struct irq_desc *desc) cpumask_copy(mask, desc->pending_mask); } #else -static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; } -static inline bool irq_move_pending(struct irq_desc *desc) { return false; } +static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } +static inline bool irq_move_pending(struct irq_data *data) { return false; } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } #endif -/** - * irq_set_affinity - Set the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @cpumask: cpumask - * - */ -int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_chip *chip = desc->irq_data.chip; - unsigned long flags; + struct irq_chip *chip = irq_data_get_irq_chip(data); + struct irq_desc *desc = irq_data_to_desc(data); int ret = 0; - if (!chip->irq_set_affinity) + if (!chip || !chip->irq_set_affinity) return 
-EINVAL; - raw_spin_lock_irqsave(&desc->lock, flags); - - if (irq_can_move_pcntxt(desc)) { - ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + if (irq_can_move_pcntxt(data)) { + ret = chip->irq_set_affinity(data, mask, false); switch (ret) { case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, mask); + cpumask_copy(data->affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); ret = 0; } } else { - irqd_set_move_pending(&desc->irq_data); + irqd_set_move_pending(data); irq_copy_pending(desc, mask); } @@ -175,8 +166,28 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) kref_get(&desc->affinity_notify->kref); schedule_work(&desc->affinity_notify->work); } - irq_compat_set_affinity(desc); - irqd_set(&desc->irq_data, IRQD_AFFINITY_SET); + irqd_set(data, IRQD_AFFINITY_SET); + + return ret; +} + +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @mask: cpumask + * + */ +int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + int ret; + + if (!desc) + return -EINVAL; + + raw_spin_lock_irqsave(&desc->lock, flags); + ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } @@ -206,7 +217,7 @@ static void irq_affinity_notify(struct work_struct *work) goto out; raw_spin_lock_irqsave(&desc->lock, flags); - if (irq_move_pending(desc)) + if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else cpumask_copy(cpumask, desc->irq_data.affinity); @@ -285,10 +296,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) if (cpumask_intersects(desc->irq_data.affinity, cpu_online_mask)) set = desc->irq_data.affinity; - else { - irq_compat_clr_affinity(desc); + else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); - } } cpumask_and(mask, cpu_online_mask, set); @@ -551,9 +560,9 @@ int 
__irq_set_trigger(struct irq_desc *desc, unsigned int irq, flags &= IRQ_TYPE_SENSE_MASK; if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { - if (!(desc->istate & IRQS_MASKED)) + if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); - if (!(desc->istate & IRQS_DISABLED)) + if (!irqd_irq_disabled(&desc->irq_data)) unmask = 1; } @@ -575,8 +584,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, irqd_set(&desc->irq_data, IRQD_LEVEL); } - if (chip != desc->irq_data.chip) - irq_chip_set_defaults(desc->irq_data.chip); ret = 0; break; default: @@ -651,7 +658,7 @@ again: * irq_wake_thread(). See the comment there which explains the * serialization. */ - if (unlikely(desc->istate & IRQS_INPROGRESS)) { + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); @@ -668,12 +675,10 @@ again: desc->threads_oneshot &= ~action->thread_mask; - if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) && - (desc->istate & IRQS_MASKED)) { - irq_compat_clr_masked(desc); - desc->istate &= ~IRQS_MASKED; - desc->irq_data.chip->irq_unmask(&desc->irq_data); - } + if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) + unmask_irq(desc); + out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); @@ -767,7 +772,7 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); raw_spin_lock_irq(&desc->lock); - if (unlikely(desc->istate & IRQS_DISABLED)) { + if (unlikely(irqd_irq_disabled(&desc->irq_data))) { /* * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which @@ -775,7 +780,6 @@ static int irq_thread(void *data) * but AFAICT IRQS_PENDING should be fine as it * retriggers the interrupt itself --- tglx */ - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { @@ -971,8 +975,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) 
new->thread_mask = 1 << ffz(thread_mask); if (!shared) { - irq_chip_set_defaults(desc->irq_data.chip); - init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ @@ -985,8 +987,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ - IRQS_INPROGRESS | IRQS_ONESHOT | \ - IRQS_WAITING); + IRQS_ONESHOT | IRQS_WAITING); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); if (new->flags & IRQF_PERCPU) { irqd_set(&desc->irq_data, IRQD_PER_CPU); @@ -1049,6 +1051,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); + free_cpumask_var(mask); return 0; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index ec4806d..4742090 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -35,7 +35,7 @@ void irq_move_masked_irq(struct irq_data *idata) * do the disable, re-program, enable sequence. * This is *not* particularly important for level triggered * but in a edge trigger case, we might be setting rte - * when an active trigger is comming in. This could + * when an active trigger is coming in. This could * cause some ioapics to mal-function. * Being paranoid i guess! * @@ -53,20 +53,14 @@ void irq_move_masked_irq(struct irq_data *idata) cpumask_clear(desc->pending_mask); } -void move_masked_irq(int irq) -{ - irq_move_masked_irq(irq_get_irq_data(irq)); -} - void irq_move_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_data_to_desc(idata); bool masked; if (likely(!irqd_is_setaffinity_pending(idata))) return; - if (unlikely(desc->istate & IRQS_DISABLED)) + if (unlikely(irqd_irq_disabled(idata))) return; /* @@ -74,15 +68,10 @@ void irq_move_irq(struct irq_data *idata) * threaded interrupt with ONESHOT set, we can end up with an * interrupt storm. 
*/ - masked = desc->istate & IRQS_MASKED; + masked = irqd_irq_masked(idata); if (!masked) idata->chip->irq_mask(idata); irq_move_masked_irq(idata); if (!masked) idata->chip->irq_unmask(idata); } - -void move_native_irq(int irq) -{ - irq_move_irq(irq_get_irq_data(irq)); -} diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 760248d..dd201bd 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -364,6 +364,10 @@ int __weak arch_show_interrupts(struct seq_file *p, int prec) return 0; } +#ifndef ACTUAL_NR_IRQS +# define ACTUAL_NR_IRQS nr_irqs +#endif + int show_interrupts(struct seq_file *p, void *v) { static int prec; @@ -373,10 +377,10 @@ int show_interrupts(struct seq_file *p, void *v) struct irqaction *action; struct irq_desc *desc; - if (i > nr_irqs) + if (i > ACTUAL_NR_IRQS) return 0; - if (i == nr_irqs) + if (i == ACTUAL_NR_IRQS) return arch_show_interrupts(p, prec); /* print header and calculate the width of the first column */ @@ -404,7 +408,20 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - seq_printf(p, " %8s", desc->irq_data.chip->name); + + if (desc->irq_data.chip) { + if (desc->irq_data.chip->irq_print_chip) + desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); + else if (desc->irq_data.chip->name) + seq_printf(p, " %8s", desc->irq_data.chip->name); + else + seq_printf(p, " %8s", "-"); + } else { + seq_printf(p, " %8s", "None"); + } +#ifdef CONFIG_GENIRC_IRQ_SHOW_LEVEL + seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? 
"Level" : "Edge"); +#endif if (desc->name) seq_printf(p, "-%-8s", desc->name); diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index ad683a9..14dd576 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -65,7 +65,6 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if (desc->istate & IRQS_REPLAY) return; if (desc->istate & IRQS_PENDING) { - irq_compat_clr_pending(desc); desc->istate &= ~IRQS_PENDING; desc->istate |= IRQS_REPLAY; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 0227ad3..0d91730 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -15,17 +15,8 @@ enum { _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; -#define IRQ_INPROGRESS GOT_YOU_MORON -#define IRQ_REPLAY GOT_YOU_MORON -#define IRQ_WAITING GOT_YOU_MORON -#define IRQ_DISABLED GOT_YOU_MORON -#define IRQ_PENDING GOT_YOU_MORON -#define IRQ_MASKED GOT_YOU_MORON -#define IRQ_WAKEUP GOT_YOU_MORON -#define IRQ_MOVE_PENDING GOT_YOU_MORON #define IRQ_PER_CPU GOT_YOU_MORON #define IRQ_NO_BALANCING GOT_YOU_MORON -#define IRQ_AFFINITY_SET GOT_YOU_MORON #define IRQ_LEVEL GOT_YOU_MORON #define IRQ_NOPROBE GOT_YOU_MORON #define IRQ_NOREQUEST GOT_YOU_MORON @@ -37,102 +28,98 @@ enum { static inline void irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) { - desc->status &= ~(clr & _IRQF_MODIFY_MASK); - desc->status |= (set & _IRQF_MODIFY_MASK); + desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK); + desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); } static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) { - return desc->status & _IRQ_PER_CPU; + return desc->status_use_accessors & _IRQ_PER_CPU; } static inline void irq_settings_set_per_cpu(struct irq_desc *desc) { - desc->status |= _IRQ_PER_CPU; + desc->status_use_accessors |= _IRQ_PER_CPU; } static inline void irq_settings_set_no_balancing(struct irq_desc *desc) { - desc->status |= _IRQ_NO_BALANCING; + desc->status_use_accessors |= _IRQ_NO_BALANCING; } static inline bool 
irq_settings_has_no_balance_set(struct irq_desc *desc) { - return desc->status & _IRQ_NO_BALANCING; + return desc->status_use_accessors & _IRQ_NO_BALANCING; } static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) { - return desc->status & IRQ_TYPE_SENSE_MASK; + return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK; } static inline void irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) { - desc->status &= ~IRQ_TYPE_SENSE_MASK; - desc->status |= mask & IRQ_TYPE_SENSE_MASK; + desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK; + desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK; } static inline bool irq_settings_is_level(struct irq_desc *desc) { - return desc->status & _IRQ_LEVEL; + return desc->status_use_accessors & _IRQ_LEVEL; } static inline void irq_settings_clr_level(struct irq_desc *desc) { - desc->status &= ~_IRQ_LEVEL; + desc->status_use_accessors &= ~_IRQ_LEVEL; } static inline void irq_settings_set_level(struct irq_desc *desc) { - desc->status |= _IRQ_LEVEL; + desc->status_use_accessors |= _IRQ_LEVEL; } static inline bool irq_settings_can_request(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOREQUEST); + return !(desc->status_use_accessors & _IRQ_NOREQUEST); } static inline void irq_settings_clr_norequest(struct irq_desc *desc) { - desc->status &= ~_IRQ_NOREQUEST; + desc->status_use_accessors &= ~_IRQ_NOREQUEST; } static inline void irq_settings_set_norequest(struct irq_desc *desc) { - desc->status |= _IRQ_NOREQUEST; + desc->status_use_accessors |= _IRQ_NOREQUEST; } static inline bool irq_settings_can_probe(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOPROBE); + return !(desc->status_use_accessors & _IRQ_NOPROBE); } static inline void irq_settings_clr_noprobe(struct irq_desc *desc) { - desc->status &= ~_IRQ_NOPROBE; + desc->status_use_accessors &= ~_IRQ_NOPROBE; } static inline void irq_settings_set_noprobe(struct irq_desc *desc) { - desc->status |= _IRQ_NOPROBE; + desc->status_use_accessors |= 
_IRQ_NOPROBE; } static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) { - return desc->status & _IRQ_MOVE_PCNTXT; + return desc->status_use_accessors & _IRQ_MOVE_PCNTXT; } static inline bool irq_settings_can_autoenable(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOAUTOEN); + return !(desc->status_use_accessors & _IRQ_NOAUTOEN); } static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) { - return desc->status & _IRQ_NESTED_THREAD; + return desc->status_use_accessors & _IRQ_NESTED_THREAD; } - -/* Nothing should touch desc->status from now on */ -#undef status -#define status USE_THE_PROPER_WRAPPERS_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dd586eb..dfbd550 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -45,12 +45,12 @@ bool irq_wait_for_poll(struct irq_desc *desc) #ifdef CONFIG_SMP do { raw_spin_unlock(&desc->lock); - while (desc->istate & IRQS_INPROGRESS) + while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); raw_spin_lock(&desc->lock); - } while (desc->istate & IRQS_INPROGRESS); + } while (irqd_irq_inprogress(&desc->irq_data)); /* Might have been disabled in meantime */ - return !(desc->istate & IRQS_DISABLED) && desc->action; + return !irqd_irq_disabled(&desc->irq_data) && desc->action; #else return false; #endif @@ -75,7 +75,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Do not poll disabled interrupts unless the spurious * disabled poller asks explicitely. 
*/ - if ((desc->istate & IRQS_DISABLED) && !force) + if (irqd_irq_disabled(&desc->irq_data) && !force) goto out; /* @@ -88,12 +88,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) goto out; /* Already running on another processor */ - if (desc->istate & IRQS_INPROGRESS) { + if (irqd_irq_inprogress(&desc->irq_data)) { /* * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too */ - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; goto out; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091..079f1d3 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr) if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || arch_is_kernel_text(addr)) return 1; - return in_gate_area_no_task(addr); + return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) return 1; - return in_gate_area_no_task(addr); + return in_gate_area_no_mm(addr); } static int is_ksym_addr(unsigned long addr) @@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, } /* Look up a kernel symbol and return it in a text buffer. 
*/ -int sprint_symbol(char *buffer, unsigned long address) +static int __sprint_symbol(char *buffer, unsigned long address, + int symbol_offset) { char *modname; const char *name; unsigned long offset, size; int len; + address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) return sprintf(buffer, "0x%lx", address); @@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address) strcpy(buffer, name); len = strlen(buffer); buffer += len; + offset -= symbol_offset; if (modname) - len += sprintf(buffer, "+%#lx/%#lx [%s]", - offset, size, modname); + len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); else len += sprintf(buffer, "+%#lx/%#lx", offset, size); return len; } + +/** + * sprint_symbol - Look up a kernel symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function looks up a kernel symbol with @address and stores its name, + * offset, size and module name to @buffer if possible. If no symbol was found, + * just saves its @address as is. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_symbol(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, 0); +} + EXPORT_SYMBOL_GPL(sprint_symbol); +/** + * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function is for stack backtrace and does the same thing as + * sprint_symbol() but with modified/decreased @address. If there is a + * tail-call to the function marked "noreturn", gcc optimized out code after + * the call so that the stack-saved return address could point outside of the + * caller. This function ensures that kallsyms will find the original caller + * by decreasing @address. + * + * This function returns the number of bytes stored in @buffer. 
+ */ +int sprint_backtrace(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, -1); +} + /* Look up a kernel symbol and print it to the kernel messages. */ void __print_symbol(const char *fmt, unsigned long address) { @@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p) */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); - seq_printf(m, "%0*lx %c %s\t[%s]\n", - (int)(2 * sizeof(void *)), - iter->value, type, iter->name, iter->module_name); + seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value, + type, iter->name, iter->module_name); } else - seq_printf(m, "%0*lx %c %s\n", - (int)(2 * sizeof(void *)), - iter->value, iter->type, iter->name); + seq_printf(m, "%pK %c %s\n", (void *)iter->value, + iter->type, iter->name); return 0; } diff --git a/kernel/kexec.c b/kernel/kexec.c index ec19b92..87b77de 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -33,6 +33,7 @@ #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/kmsg_dump.h> +#include <linux/syscore_ops.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, /* Initialize the list of destination pages */ INIT_LIST_HEAD(&image->dest_pages); - /* Initialize the list of unuseable pages */ + /* Initialize the list of unusable pages */ INIT_LIST_HEAD(&image->unuseable_pages); /* Read in the segments */ @@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, /* Deal with the destination pages I have inadvertently allocated. * * Ideally I would convert multi-page allocations into single - * page allocations, and add everyting to image->dest_pages. + * page allocations, and add everything to image->dest_pages. * * For now it is simpler to just free the pages. 
*/ @@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image) /* Walk through and free any extra destination pages I may have */ kimage_free_page_list(&image->dest_pages); - /* Walk through and free any unuseable pages I have cached */ + /* Walk through and free any unusable pages I have cached */ kimage_free_page_list(&image->unuseable_pages); } @@ -1099,7 +1100,8 @@ size_t crash_get_memory_size(void) return size; } -static void free_reserved_phys_range(unsigned long begin, unsigned long end) +void __weak crash_free_reserved_phys_range(unsigned long begin, + unsigned long end) { unsigned long addr; @@ -1135,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size) start = roundup(start, PAGE_SIZE); end = roundup(start + new_size, PAGE_SIZE); - free_reserved_phys_range(end, crashk_res.end); + crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); @@ -1531,6 +1533,11 @@ int kernel_kexec(void) local_irq_disable(); /* Suspend system devices */ error = sysdev_suspend(PMSG_FREEZE); + if (!error) { + error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (error) goto Enable_irqs; } else @@ -1545,6 +1552,7 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { + syscore_resume(); sysdev_resume(); Enable_irqs: local_irq_enable(); diff --git a/kernel/kthread.c b/kernel/kthread.c index c55afba..3b34d27 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -27,6 +27,7 @@ struct kthread_create_info /* Information passed to kthread() from kthreadd. */ int (*threadfn)(void *data); void *data; + int node; /* Result passed back to kthread_create() from kthreadd. 
*/ struct task_struct *result; @@ -98,10 +99,23 @@ static int kthread(void *_create) do_exit(ret); } +/* called from do_fork() to get node information for about to be created task */ +int tsk_fork_get_node(struct task_struct *tsk) +{ +#ifdef CONFIG_NUMA + if (tsk == kthreadd_task) + return tsk->pref_node_fork; +#endif + return numa_node_id(); +} + static void create_kthread(struct kthread_create_info *create) { int pid; +#ifdef CONFIG_NUMA + current->pref_node_fork = create->node; +#endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { @@ -111,33 +125,38 @@ static void create_kthread(struct kthread_create_info *create) } /** - * kthread_create - create a kthread. + * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. + * @node: memory node number. * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). * + * If thread is going to be bound on a particular cpu, give its node + * in @node, to get NUMA affinity for kthread stack, or else give -1. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a - * standalone thread for which noone will call kthread_stop(), or + * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM). */ -struct task_struct *kthread_create(int (*threadfn)(void *data), - void *data, - const char namefmt[], - ...) 
+struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), + void *data, + int node, + const char namefmt[], + ...) { struct kthread_create_info create; create.threadfn = threadfn; create.data = data; + create.node = node; init_completion(&create.done); spin_lock(&kthread_create_lock); @@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } return create.result; } -EXPORT_SYMBOL(kthread_create); +EXPORT_SYMBOL(kthread_create_on_node); /** * kthread_bind - bind a just-created kthread to a cpu. diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ee74b35..376066e 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk, } /** - * __account_scheduler_latency - record an occured latency + * __account_scheduler_latency - record an occurred latency * @tsk - the task struct of the task hitting the latency * @usecs - the duration of the latency in microseconds * @inter - 1 if the sleep was interruptible, 0 if uninterruptible diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0d2058d..53a6895 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2309,7 +2309,7 @@ void trace_hardirqs_on_caller(unsigned long ip) if (unlikely(curr->hardirqs_enabled)) { /* * Neither irq nor preemption are disabled here - * so this is racy by nature but loosing one hit + * so this is racy by nature but losing one hit * in a stat is not a big deal. 
*/ __debug_atomic_inc(redundant_hardirqs_on); @@ -2620,7 +2620,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (!graph_lock()) return 0; /* - * Make sure we didnt race: + * Make sure we didn't race: */ if (unlikely(hlock_class(this)->usage_mask & new_mask)) { graph_unlock(); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 1969d2f..71edd2f 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, - sum_forward_deps = 0, factor = 0; + sum_forward_deps = 0; list_for_each_entry(class, &all_lock_classes, lock_entry) { @@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_hardirq_unsafe * nr_hardirq_safe + nr_list_entries); - /* - * Estimated factor between direct and indirect - * dependencies: - */ - if (nr_list_entries) - factor = sum_forward_deps / nr_list_entries; - #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", nr_lock_chains, MAX_LOCKDEP_CHAINS); diff --git a/kernel/module.c b/kernel/module.c index efa290e..d5938a5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -809,7 +809,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, wait_for_zero_refcount(mod); mutex_unlock(&module_mutex); - /* Final destruction now noone is using it. */ + /* Final destruction now no one is using it. 
*/ if (mod->exit != NULL) mod->exit(); blocking_notifier_call_chain(&module_notify_list, @@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr, { struct module_sect_attr *sattr = container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%lx\n", sattr->address); + return sprintf(buf, "0x%pK\n", (void *)sattr->address); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -2777,7 +2777,7 @@ static struct module *load_module(void __user *umod, mod->state = MODULE_STATE_COMING; /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. Noone should access us, since + * info during argument parsing. No one should access us, since * strong_try_module_get() will fail. * lockdep/oops can run asynchronous, so use the RCU list insertion * function to insert in a way safe to concurrent readers. @@ -2971,7 +2971,7 @@ static const char *get_ksymbol(struct module *mod, else nextval = (unsigned long)mod->module_core+mod->core_text_size; - /* Scan for closest preceeding symbol, and next symbol. (ELF + /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ for (i = 1; i < mod->num_symtab; i++) { if (mod->symtab[i].st_shndx == SHN_UNDEF) @@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading": "Live"); /* Used by oprofile and other similar tools. 
*/ - seq_printf(m, " 0x%p", mod->module_core); + seq_printf(m, " 0x%pK", mod->module_core); /* Taints info */ if (mod->taints) diff --git a/kernel/mutex.c b/kernel/mutex.c index a5889fb..c4195fa 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -245,7 +245,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, state); - /* didnt get the lock, go to sleep: */ + /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); preempt_enable_no_resched(); schedule(); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f74e6c0..a05d191 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -69,13 +69,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ns; } - new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); + new_nsp->uts_ns = copy_utsname(flags, tsk); if (IS_ERR(new_nsp->uts_ns)) { err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } - new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); + new_nsp->ipc_ns = copy_ipcs(flags, tsk); if (IS_ERR(new_nsp->ipc_ns)) { err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; diff --git a/kernel/padata.c b/kernel/padata.c index 7510194..b91941d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd) /* * This cpu has to do the parallel processing of the next * object. It's waiting in the cpu's parallelization queue, - * so exit imediately. + * so exit immediately. */ if (PTR_ERR(padata) == -ENODATA) { del_timer(&pd->timer); @@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd) /* * The next object that needs serialization might have arrived to * the reorder queues in the meantime, we will be called again - * from the timer function if noone else cares for it. + * from the timer function if no one else cares for it. 
*/ if (atomic_read(&pd->reorder_objects) && !(pinst->flags & PADATA_RESET)) @@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst) put_online_cpus(); } -/* Replace the internal control stucture with a new one. */ +/* Replace the internal control structure with a new one. */ static void padata_replace(struct padata_instance *pinst, struct parallel_data *pd_new) { @@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) } /** - * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) + * padata_remove_cpu - remove a cpu from the one or both(serial and parallel) * padata cpumasks. * * @pinst: padata instance diff --git a/kernel/panic.c b/kernel/panic.c index 991bb87..6923167 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); diff --git a/kernel/params.c b/kernel/params.c index 0da1411..7ab388a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -95,7 +95,7 @@ static int parse_one(char *param, /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { - /* Noone handled NULL, so do it here. */ + /* No one handled NULL, so do it here. */ if (!val && params[i].ops->set != param_set_bool) return -EINVAL; DEBUGP("They are equal! 
Calling %p\n", diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3472bb1..8e81a98 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -145,7 +145,8 @@ static struct srcu_struct pmus_srcu; */ int sysctl_perf_event_paranoid __read_mostly = 1; -int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ +/* Minimum for 512 kiB + 1 user control page */ +int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ /* * max perf event sample rate @@ -363,6 +364,7 @@ void perf_cgroup_switch(struct task_struct *task, int mode) } if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); /* set cgrp before ctxsw in to * allow event_filter_match() to not * have to pass task around @@ -941,6 +943,7 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_cpu_context *cpuctx; /* * We can have double detach due to exit/hot-unplug + close. */ @@ -949,8 +952,17 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) + if (is_cgroup_event(event)) { ctx->nr_cgroups--; + cpuctx = __get_cpu_context(ctx); + /* + * if there are no more cgroup events + * then clear cgrp to avoid stale pointer + * in update_cgrp_time_from_cpuctx() + */ + if (!ctx->nr_cgroups) + cpuctx->cgrp = NULL; + } ctx->nr_events--; if (event->attr.inherit_stat) @@ -2412,6 +2424,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) if (!ctx || !ctx->nr_events) goto out; + /* + * We must ctxsw out cgroup events to avoid conflict + * when invoking perf_task_event_sched_in() later on + * in this function. Otherwise we end up trying to + * ctxswin cgroup events which are already scheduled + * in. 
+ */ + perf_cgroup_sched_out(current); task_ctx_sched_out(ctx, EVENT_ALL); raw_spin_lock(&ctx->lock); @@ -2436,6 +2456,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); + /* + * Also calls ctxswin for cgroup events, if any: + */ perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); @@ -6520,6 +6543,11 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } + if (task) { + put_task_struct(task); + task = NULL; + } + /* * Look up the group leader (we will attach this event to it): */ diff --git a/kernel/pid.c b/kernel/pid.c index 02f2212..57a8346 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) return -1; } -int next_pidmap(struct pid_namespace *pid_ns, int last) +int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) { int offset; struct pidmap *map, *end; + if (last >= PID_MAX_LIMIT) + return -1; + offset = (last + 1) & BITS_PER_PAGE_MASK; map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; end = &pid_ns->pidmap[PIDMAP_ENTRIES]; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a5aff94..e9c9adc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -14,6 +14,7 @@ #include <linux/err.h> #include <linux/acct.h> #include <linux/slab.h> +#include <linux/proc_fs.h> #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; - int i; + int i, err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) @@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); + err = pid_ns_prepare_proc(ns); + if (err) + goto out_put_parent_pid_ns; + return ns; +out_put_parent_pid_ns: + 
put_pid_ns(parent_pid_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: kmem_cache_free(pid_ns_cachep, ns); out: - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); } static void destroy_pid_namespace(struct pid_namespace *ns) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 67fea9d..0791b13 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1347,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) /* * Now that all the timers on our list have the firing flag, - * noone will touch their list entries but us. We'll take + * no one will touch their list entries but us. We'll take * each timer's lock before clearing its firing flag, so no * timer call will interfere. */ diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4c01249..e5498d7 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -313,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr) * restarted (i.e. we have flagged this in the sys_private entry of the * info block). * - * To protect aginst the timer going away while the interrupt is queued, + * To protect against the timer going away while the interrupt is queued, * we require that the it_requeue_pending flag be set. */ void do_schedule_next_timer(struct siginfo *info) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 4603f08..6de9a8f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -18,9 +18,13 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. 
+config HIBERNATE_CALLBACKS + bool + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on SWAP && ARCH_HIBERNATION_POSSIBLE + select HIBERNATE_CALLBACKS select LZO_COMPRESS select LZO_DECOMPRESS ---help--- @@ -85,7 +89,7 @@ config PM_STD_PARTITION config PM_SLEEP def_bool y - depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE + depends on SUSPEND || HIBERNATE_CALLBACKS config PM_SLEEP_SMP def_bool y diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c350e18..c5ebc6a 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,4 +1,5 @@ -ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG + +ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 83bbc7c..d09dd10 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -28,7 +28,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, struct page *page, struct bio **bio_chain) { - const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; + const int bio_rw = rw | REQ_SYNC; struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index aeabd26..50aae66 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -273,8 +273,11 @@ static int create_image(int platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_FREEZE); - if (!error) + if (!error) { error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); @@ -407,8 +410,11 @@ static int resume_target_kernel(bool platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_QUIESCE); - if (!error) + if (!error) { error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (error) goto Enable_irqs; diff --git a/kernel/power/main.c b/kernel/power/main.c index 8eaba5f..de9aef8 100644 --- 
a/kernel/power/main.c +++ b/kernel/power/main.c @@ -224,7 +224,7 @@ power_attr(state); * writing to 'state'. It first should read from 'wakeup_count' and store * the read value. Then, after carrying out its own preparations for the system * transition to a sleep state, it should write the stored value to - * 'wakeup_count'. If that fails, at least one wakeup event has occured since + * 'wakeup_count'. If that fails, at least one wakeup event has occurred since * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it * is allowed to write to 'state', but the transition will be aborted if there * are any wakeup events detected after 'wakeup_count' was written to. diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 2814c32..8935369 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -164,8 +164,11 @@ static int suspend_enter(suspend_state_t state) BUG_ON(!irqs_disabled()); error = sysdev_suspend(PMSG_SUSPEND); - if (!error) + if (!error) { error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (!error) { if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); diff --git a/kernel/printk.c b/kernel/printk.c index 33284ad..da8ca81 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) /* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ +#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL /* We show everything that is MORE important than this.. 
*/ #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ @@ -113,6 +113,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ /* + * If exclusive_console is non-NULL then only this console is to be printed to. + */ +static struct console *exclusive_console; + +/* * Array of consoles built from command line options (console=) */ struct console_cmdline @@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end) struct console *con; for_each_console(con) { + if (exclusive_console && con != exclusive_console) + continue; if ((con->flags & CON_ENABLED) && con->write && (cpu_online(smp_processor_id()) || (con->flags & CON_ANYTIME))) @@ -1230,6 +1237,11 @@ void console_unlock(void) local_irq_restore(flags); } console_locked = 0; + + /* Release the exclusive_console once it is used */ + if (unlikely(exclusive_console)) + exclusive_console = NULL; + up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) @@ -1316,6 +1328,18 @@ void console_start(struct console *console) } EXPORT_SYMBOL(console_start); +static int __read_mostly keep_bootcon; + +static int __init keep_bootcon_setup(char *str) +{ + keep_bootcon = 1; + printk(KERN_INFO "debug: skip boot console de-registration.\n"); + + return 0; +} + +early_param("keep_bootcon", keep_bootcon_setup); + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -1452,6 +1476,12 @@ void register_console(struct console *newcon) spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; spin_unlock_irqrestore(&logbuf_lock, flags); + /* + * We're about to replay the log buffer. Only do this to the + * just-registered console to avoid excessive message spam to + * the already-registered consoles. 
+ */ + exclusive_console = newcon; } console_unlock(); console_sysfs_notify(); @@ -1463,7 +1493,9 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + if (bcon && + ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && + !keep_bootcon) { /* we need to iterate through twice, to make sure we print * everything out, before we unregister the console(s) */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e2302e4..0fc1eed 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -134,21 +134,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) return 0; rcu_read_lock(); tcred = __task_cred(task); - if ((cred->uid != tcred->euid || - cred->uid != tcred->suid || - cred->uid != tcred->uid || - cred->gid != tcred->egid || - cred->gid != tcred->sgid || - cred->gid != tcred->gid) && - !capable(CAP_SYS_PTRACE)) { - rcu_read_unlock(); - return -EPERM; - } + if (cred->user->user_ns == tcred->user->user_ns && + (cred->uid == tcred->euid && + cred->uid == tcred->suid && + cred->uid == tcred->uid && + cred->gid == tcred->egid && + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + goto ok; + if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) + goto ok; + rcu_read_unlock(); + return -EPERM; +ok: rcu_read_unlock(); smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !capable(CAP_SYS_PTRACE)) + if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) return -EPERM; return security_ptrace_access_check(task, mode); @@ -198,7 +201,7 @@ static int ptrace_attach(struct task_struct *task) goto unlock_tasklist; task->ptrace = PT_PTRACED; - if (capable(CAP_SYS_PTRACE)) + if (task_ns_capable(task, CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); diff --git a/kernel/res_counter.c 
b/kernel/res_counter.c index c7eaa37..34683ef 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member, pos, buf, s - buf); } +#if BITS_PER_LONG == 32 +u64 res_counter_read_u64(struct res_counter *counter, int member) +{ + unsigned long flags; + u64 ret; + + spin_lock_irqsave(&counter->lock, flags); + ret = *res_counter_member(counter, member); + spin_unlock_irqrestore(&counter->lock, flags); + + return ret; +} +#else u64 res_counter_read_u64(struct res_counter *counter, int member) { return *res_counter_member(counter, member); } +#endif int res_counter_memparse_write_strategy(const char *buf, unsigned long long *res) diff --git a/kernel/sched.c b/kernel/sched.c index a172494..312f8b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2309,7 +2309,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * Cause a process which is running on another CPU to enter * kernel-mode, without any delay. (to get signals handled.) * - * NOTE: this function doesnt have to take the runqueue lock, + * NOTE: this function doesn't have to take the runqueue lock, * because all it wants to ensure is that the remote task enters * the kernel. If the IPI races and the task has been migrated * to another CPU then no harm is done and the purpose has been @@ -4111,6 +4111,16 @@ need_resched: try_to_wake_up_local(to_wakeup); } deactivate_task(rq, prev, DEQUEUE_SLEEP); + + /* + * If we are going to sleep and we have plugged IO queued, make + * sure to submit it to avoid deadlocks. 
+ */ + if (blk_needs_flush_plug(prev)) { + raw_spin_unlock(&rq->lock); + blk_schedule_flush_plug(prev); + raw_spin_lock(&rq->lock); + } } switch_count = &prev->nvcsw; } @@ -4892,8 +4902,11 @@ static bool check_same_owner(struct task_struct *p) rcu_read_lock(); pcred = __task_cred(p); - match = (cred->euid == pcred->euid || - cred->euid == pcred->uid); + if (cred->user->user_ns == pcred->user->user_ns) + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + else + match = false; rcu_read_unlock(); return match; } @@ -4984,7 +4997,7 @@ recheck: */ raw_spin_lock_irqsave(&p->pi_lock, flags); /* - * To be able to change p->policy safely, the apropriate + * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ rq = __task_rq_lock(p); @@ -4998,6 +5011,17 @@ recheck: return -EINVAL; } + /* + * If not changing anything there's no need to proceed further: + */ + if (unlikely(policy == p->policy && (!rt_policy(policy) || + param->sched_priority == p->rt_priority))) { + + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return 0; + } + #ifdef CONFIG_RT_GROUP_SCHED if (user) { /* @@ -5221,7 +5245,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) + if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p); @@ -5460,6 +5484,8 @@ EXPORT_SYMBOL(yield); * yield_to - yield the current processor to another thread in * your thread group, or accelerate that thread toward the * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not * * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. 
@@ -5525,6 +5551,7 @@ void __sched io_schedule(void) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; schedule(); current->in_iowait = 0; @@ -5540,6 +5567,7 @@ long __sched io_schedule_timeout(long timeout) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; ret = schedule_timeout(timeout); current->in_iowait = 0; @@ -5688,7 +5716,7 @@ void show_state_filter(unsigned long state_filter) do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow - * console might take alot of time: + * console might take a lot of time: */ touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) @@ -6303,6 +6331,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif } + + update_max_interval(); + return NOTIFY_OK; } @@ -8434,7 +8465,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; struct sched_entity *se; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8447,8 +8477,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 5946ac5..429242f 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -179,7 +179,7 @@ void sched_autogroup_create_attach(struct task_struct *p) struct autogroup *ag = autogroup_create(); autogroup_move_group(p, ag); - /* drop extra refrence added by autogroup_create() */ + /* drop extra reference added by autogroup_create() */ autogroup_kref_put(ag); } EXPORT_SYMBOL(sched_autogroup_create_attach); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3f7ec9e..6fa833a 100644 --- a/kernel/sched_fair.c +++ 
b/kernel/sched_fair.c @@ -22,6 +22,7 @@ #include <linux/latencytop.h> #include <linux/sched.h> +#include <linux/cpumask.h> /* * Targeted preemption latency for CPU-bound tasks: @@ -2103,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, struct cfs_rq *busiest_cfs_rq) { - int loops = 0, pulled = 0, pinned = 0; + int loops = 0, pulled = 0; long rem_load_move = max_load_move; struct task_struct *p, *n; if (max_load_move == 0) goto out; - pinned = 1; - list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) break; if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) + !can_migrate_task(p, busiest, this_cpu, sd, idle, + all_pinned)) continue; pull_task(busiest, p, this_rq, this_cpu); @@ -2152,9 +2152,6 @@ out: */ schedstat_add(sd, lb_gained[idle], pulled); - if (all_pinned) - *all_pinned = pinned; - return max_load_move - rem_load_move; } @@ -3061,7 +3058,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, /* * if *imbalance is less than the average load per runnable task - * there is no gaurantee that any tasks will be moved so we'll have + * there is no guarantee that any tasks will be moved so we'll have * a think about bumping its value to force at least one task to be * moved */ @@ -3126,6 +3123,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; + /* * If the busiest group is imbalanced the below checks don't * work because they assumes all things are equal, which typically @@ -3150,7 +3149,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Don't pull any tasks if this group is already above the domain * average load. 
*/ - sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; if (sds.this_load >= sds.avg_load) goto out_balanced; @@ -3339,6 +3337,7 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ + all_pinned = 1; local_irq_save(flags); double_rq_lock(this_rq, busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, @@ -3819,6 +3818,17 @@ void select_nohz_load_balancer(int stop_tick) static DEFINE_SPINLOCK(balancing); +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + +/* + * Scale the max load_balance interval with the number of CPUs in the system. + * This trades load-balance latency on larger machines for less cross talk. + */ +static void update_max_interval(void) +{ + max_load_balance_interval = HZ*num_online_cpus()/10; +} + /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. @@ -3848,10 +3858,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); - if (unlikely(!interval)) - interval = 1; - if (interval > HZ*NR_CPUS/10) - interval = HZ*NR_CPUS/10; + interval = clamp(interval, 1UL, max_load_balance_interval); need_serialize = sd->flags & SD_SERIALIZE; diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index c82f26c1..a776a63 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -94,6 +94,4 @@ static const struct sched_class idle_sched_class = { .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, - - /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index db308cb..e7cebdc 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1378,7 +1378,7 @@ retry: task = pick_next_pushable_task(rq); if (task_cpu(next_task) == rq->cpu && task == next_task) { /* - * If we get here, the task hasnt moved at all, but + * If we get here, the task hasn't moved at all, but * 
it has failed to push. We will not try again, * since the other cpus will pull from us when they * are ready. @@ -1488,7 +1488,7 @@ static int pull_rt_task(struct rq *this_rq) /* * We continue with the search, just in * case there's an even higher prio task - * in another runqueue. (low likelyhood + * in another runqueue. (low likelihood * but possible) */ } diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 84ec9bc..1ba2bd4 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -102,6 +102,4 @@ static const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, - - /* no .task_new for stop tasks */ }; diff --git a/kernel/signal.c b/kernel/signal.c index 4e3cff1..7165af5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -226,7 +226,7 @@ static inline void print_dropped_signal(int sig) /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an - * appopriate lock must be held to stop the target task from exiting + * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue * __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) @@ -375,15 +375,15 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tracehook_consider_fatal_signal(tsk, sig); } - -/* Notify the system that a driver wants to block all signals for this +/* + * Notify the system that a driver wants to block all signals for this * process, and wants to be notified if any signals at all were to be * sent/acted upon. If the notifier routine returns non-zero, then the * signal will be acted upon after all. If the notifier routine returns 0, * then then signal will be blocked. Only one block per process is * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. 
*/ - + * can use to determine if the signal should be blocked or not. + */ void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) { @@ -434,9 +434,10 @@ still_pending: copy_siginfo(info, &first->info); __sigqueue_free(first); } else { - /* Ok, it wasn't in the queue. This must be - a fast-pathed signal or we must have been - out of queue space. So zero out the info. + /* + * Ok, it wasn't in the queue. This must be + * a fast-pathed signal or we must have been + * out of queue space. So zero out the info. */ info->si_signo = sig; info->si_errno = 0; @@ -468,7 +469,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, } /* - * Dequeue a signal and return the element to the caller, which is + * Dequeue a signal and return the element to the caller, which is * expected to free it. * * All callers have to hold the siglock. @@ -490,7 +491,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * itimers are process shared and we restart periodic * itimers in the signal delivery path to prevent DoS * attacks in the high resolution timer case. This is - * compliant with the old way of self restarting + * compliant with the old way of self-restarting * itimers, as the SIGALRM is a legacy signal and only * queued once. 
Changing the restart behaviour to * restart the timer in the signal dequeue path is @@ -636,13 +637,33 @@ static inline bool si_fromuser(const struct siginfo *info) } /* + * called with RCU read lock from check_kill_permission() + */ +static int kill_ok_by_cred(struct task_struct *t) +{ + const struct cred *cred = current_cred(); + const struct cred *tcred = __task_cred(t); + + if (cred->user->user_ns == tcred->user->user_ns && + (cred->euid == tcred->suid || + cred->euid == tcred->uid || + cred->uid == tcred->suid || + cred->uid == tcred->uid)) + return 1; + + if (ns_capable(tcred->user->user_ns, CAP_KILL)) + return 1; + + return 0; +} + +/* * Bad permissions for sending the signal * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { - const struct cred *cred, *tcred; struct pid *sid; int error; @@ -656,14 +677,8 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; - cred = current_cred(); - tcred = __task_cred(t); if (!same_thread_group(current, t) && - (cred->euid ^ tcred->suid) && - (cred->euid ^ tcred->uid) && - (cred->uid ^ tcred->suid) && - (cred->uid ^ tcred->uid) && - !capable(CAP_KILL)) { + !kill_ok_by_cred(t)) { switch (sig) { case SIGCONT: sid = task_session(t); @@ -909,14 +924,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, if (info == SEND_SIG_FORCED) goto out_set; - /* Real-time signals must be queued if sent by sigqueue, or - some other real-time mechanism. It is implementation - defined whether kill() does so. We attempt to do so, on - the principle of least surprise, but since kill is not - allowed to fail with EAGAIN when low on memory we just - make sure at least one signal gets delivered and don't - pass on the info struct. */ - + /* + * Real-time signals must be queued if sent by sigqueue, or + * some other real-time mechanism. It is implementation + * defined whether kill() does so. 
We attempt to do so, on + * the principle of least surprise, but since kill is not + * allowed to fail with EAGAIN when low on memory we just + * make sure at least one signal gets delivered and don't + * pass on the info struct. + */ if (sig < SIGRTMIN) override_rlimit = (is_si_special(info) || info->si_code >= 0); else @@ -1187,8 +1203,7 @@ retry: return error; } -int -kill_proc_info(int sig, struct siginfo *info, pid_t pid) +int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; rcu_read_lock(); @@ -1285,8 +1300,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ -int -send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { /* * Make sure legacy kernel users don't send in bad values @@ -1354,7 +1368,7 @@ EXPORT_SYMBOL(kill_pid); * These functions support sending signals using preallocated sigqueue * structures. This is needed "because realtime applications cannot * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of Posix Timers + * expirations or I/O completions". In the case of POSIX Timers * we allocate the sigqueue structure from the timer_create. If this * allocation fails we are able to report the failure to the application * with an EAGAIN error. @@ -1539,7 +1553,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) info.si_signo = SIGCHLD; info.si_errno = 0; /* - * see comment in do_notify_parent() abot the following 3 lines + * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); @@ -1597,7 +1611,7 @@ static inline int may_ptrace_stop(void) } /* - * Return nonzero if there is a SIGKILL that should be waking us up. 
+ * Return non-zero if there is a SIGKILL that should be waking us up. * Called with the siglock held. */ static int sigkill_pending(struct task_struct *tsk) @@ -1721,7 +1735,7 @@ void ptrace_notify(int exit_code) /* * This performs the stopping for SIGSTOP and other stop signals. * We have to stop all threads in the thread group. - * Returns nonzero if we've actually stopped and released the siglock. + * Returns non-zero if we've actually stopped and released the siglock. * Returns zero if we didn't stop and still hold the siglock. */ static int do_signal_stop(int signr) @@ -1809,10 +1823,12 @@ static int ptrace_signal(int signr, siginfo_t *info, current->exit_code = 0; - /* Update the siginfo structure if the signal has - changed. If the debugger wanted something - specific in the siginfo structure then it should - have updated *info via PTRACE_SETSIGINFO. */ + /* + * Update the siginfo structure if the signal has + * changed. If the debugger wanted something + * specific in the siginfo structure then it should + * have updated *info via PTRACE_SETSIGINFO. + */ if (signr != info->si_signo) { info->si_signo = signr; info->si_errno = 0; @@ -1871,7 +1887,7 @@ relock: for (;;) { struct k_sigaction *ka; /* - * Tracing can induce an artifical signal and choose sigaction. + * Tracing can induce an artificial signal and choose sigaction. * The return value in @signr determines the default action, * but @info->si_signo is the signal number we will report. */ @@ -2020,7 +2036,8 @@ void exit_signals(struct task_struct *tsk) if (!signal_pending(tsk)) goto out; - /* It could be that __group_complete_signal() choose us to + /* + * It could be that __group_complete_signal() choose us to * notify about group-wide signal. Another thread should be * woken now to take the signal since we will not. */ @@ -2058,6 +2075,9 @@ EXPORT_SYMBOL(unblock_all_signals); * System call entry points. 
*/ +/** + * sys_restart_syscall - restart a system call + */ SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = &current_thread_info()->restart_block; @@ -2111,6 +2131,13 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) return error; } +/** + * sys_rt_sigprocmask - change the list of currently blocked signals + * @how: whether to add, remove, or set signals + * @set: stores pending signals + * @oset: previous value of signal mask if non-null + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, sigset_t __user *, oset, size_t, sigsetsize) { @@ -2169,8 +2196,14 @@ long do_sigpending(void __user *set, unsigned long sigsetsize) out: return error; -} +} +/** + * sys_rt_sigpending - examine a pending signal that has been raised + * while blocked + * @set: stores pending signals + * @sigsetsize: size of sigset_t type or larger + */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) { return do_sigpending(set, sigsetsize); @@ -2219,9 +2252,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_trapno, &to->si_trapno); #endif #ifdef BUS_MCEERR_AO - /* + /* * Other callers might not initialize the si_lsb field, - * so check explicitely for the right codes here. + * so check explicitly for the right codes here. 
*/ if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); @@ -2250,6 +2283,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #endif +/** + * sys_rt_sigtimedwait - synchronously wait for queued signals specified + * in @uthese + * @uthese: queued signals to wait for + * @uinfo: if non-null, the signal's siginfo is returned here + * @uts: upper bound on process time suspension + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct timespec __user *, uts, size_t, sigsetsize) @@ -2266,7 +2307,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; - + /* * Invert the set of allowed signals to get those we * want to block. @@ -2291,9 +2332,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, + (ts.tv_sec || ts.tv_nsec)); if (timeout) { - /* None ready -- temporarily unblock those we're + /* + * None ready -- temporarily unblock those we're * interested while we are sleeping in so that we'll - * be awakened when they arrive. */ + * be awakened when they arrive. + */ current->real_blocked = current->blocked; sigandsets(&current->blocked, &current->blocked, &these); recalc_sigpending(); @@ -2325,6 +2368,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return ret; } +/** + * sys_kill - send a signal to a process + * @pid: the PID of the process + * @sig: signal to be sent + */ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; @@ -2400,7 +2448,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) return do_tkill(tgid, pid, sig); } -/* +/** + * sys_tkill - send signal to one specific task + * @pid: the PID of the task + * @sig: signal to be sent + * * Send a signal to only one task, even if it's a CLONE_THREAD task. 
*/ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) @@ -2412,6 +2464,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) return do_tkill(0, pid, sig); } +/** + * sys_rt_sigqueueinfo - send signal information to a signal + * @pid: the PID of the thread + * @sig: signal to be sent + * @uinfo: signal info to be sent + */ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { @@ -2421,9 +2479,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, return -EFAULT; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info.si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info.si_code >= 0 || info.si_code == SI_TKILL) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info.si_code < 0); return -EPERM; + } info.si_signo = sig; /* POSIX.1b doesn't mention process groups. */ @@ -2437,9 +2499,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) return -EINVAL; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info->si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info->si_code >= 0 || info->si_code == SI_TKILL) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info->si_code < 0); return -EPERM; + } info->si_signo = sig; return do_send_specific(tgid, pid, sig, info); @@ -2531,12 +2597,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s error = -EINVAL; /* - * - * Note - this code used to test ss_flags incorrectly + * Note - this code used to test ss_flags incorrectly: * old code may have been written using ss_flags==0 * to mean ss_flags==SS_ONSTACK (as this was the only * way that worked) - this fix preserves that older - * mechanism + * mechanism. 
*/ if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) goto out; @@ -2570,6 +2635,10 @@ out: #ifdef __ARCH_WANT_SYS_SIGPENDING +/** + * sys_sigpending - examine pending signals + * @set: where mask of pending signal is returned + */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) { return do_sigpending(set, sizeof(*set)); @@ -2578,8 +2647,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK -/* Some platforms have their own version with special arguments others - support only sys_rt_sigprocmask. */ +/** + * sys_sigprocmask - examine and change blocked signals + * @how: whether to add, remove, or set signals + * @set: signals to add or remove (if non-null) + * @oset: previous value of signal mask if non-null + * + * Some platforms have their own version with special arguments; + * others support only sys_rt_sigprocmask. + */ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, old_sigset_t __user *, oset) @@ -2632,6 +2708,13 @@ out: #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifdef __ARCH_WANT_SYS_RT_SIGACTION +/** + * sys_rt_sigaction - alter an action taken by a process + * @sig: signal to be sent + * @act: new sigaction + * @oact: used to save the previous sigaction + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct sigaction __user *, act, struct sigaction __user *, oact, @@ -2718,6 +2801,12 @@ SYSCALL_DEFINE0(pause) #endif #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND +/** + * sys_rt_sigsuspend - replace the signal mask for a value with the + * @unewset value until a signal is received + * @unewset: new signal mask value + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; diff --git a/kernel/smp.c b/kernel/smp.c index 7cbd0f2..73a1951 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -604,6 +604,87 @@ void ipi_call_unlock_irq(void) } #endif /* 
USE_GENERIC_SMP_HELPERS */ +/* Setup configured maximum number of CPUs to activate */ +unsigned int setup_max_cpus = NR_CPUS; +EXPORT_SYMBOL(setup_max_cpus); + + +/* + * Setup routine for controlling SMP activation + * + * Command-line option of "nosmp" or "maxcpus=0" will disable SMP + * activation entirely (the MPS table probe still happens, though). + * + * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer + * greater than 0, limits the maximum number of CPUs activated in + * SMP mode to <NUM>. + */ + +void __weak arch_disable_smp_support(void) { } + +static int __init nosmp(char *str) +{ + setup_max_cpus = 0; + arch_disable_smp_support(); + + return 0; +} + +early_param("nosmp", nosmp); + +/* this is hard limit */ +static int __init nrcpus(char *str) +{ + int nr_cpus; + + get_option(&str, &nr_cpus); + if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + nr_cpu_ids = nr_cpus; + + return 0; +} + +early_param("nr_cpus", nrcpus); + +static int __init maxcpus(char *str) +{ + get_option(&str, &setup_max_cpus); + if (setup_max_cpus == 0) + arch_disable_smp_support(); + + return 0; +} + +early_param("maxcpus", maxcpus); + +/* Setup number of possible processor ids */ +int nr_cpu_ids __read_mostly = NR_CPUS; +EXPORT_SYMBOL(nr_cpu_ids); + +/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ +void __init setup_nr_cpu_ids(void) +{ + nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; +} + +/* Called by boot processor to activate the rest. */ +void __init smp_init(void) +{ + unsigned int cpu; + + /* FIXME: This should be done in userspace --RR */ + for_each_present_cpu(cpu) { + if (num_online_cpus() >= setup_max_cpus) + break; + if (!cpu_online(cpu)) + cpu_up(cpu); + } + + /* Any cleanup work */ + printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); + smp_cpus_done(setup_max_cpus); +} + /* * Call a function on all processors. May be used during early boot while * early_boot_irqs_disabled is set. 
Use local_irq_save/restore() instead diff --git a/kernel/softirq.c b/kernel/softirq.c index 56e5dec..174f976 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -567,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data) /** * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks * @ttimer: tasklet_hrtimer which is initialized - * @function: hrtimer callback funtion which gets called from softirq context + * @function: hrtimer callback function which gets called from softirq context * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) */ @@ -845,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); + p = kthread_create_on_node(run_ksoftirqd, + hcpu, + cpu_to_node(hotcpu), + "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); return notifier_from_errno(PTR_ERR(p)); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2df820b..e3516b2 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: BUG_ON(stopper->thread || stopper->enabled || !list_empty(&stopper->works)); - p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", - cpu); + p = kthread_create_on_node(cpu_stopper_thread, + stopper, + cpu_to_node(cpu), + "migration/%d", cpu); if (IS_ERR(p)) return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); diff --git a/kernel/sys.c b/kernel/sys.c index 1ad48b3..af468ed 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -120,16 +120,33 @@ EXPORT_SYMBOL(cad_pid); void (*pm_power_off_prepare)(void); /* + * Returns true if current's euid is same as p's uid or euid, + * or has CAP_SYS_NICE to p's user_ns. 
+ * + * Called with rcu_read_lock, creds are safe + */ +static bool set_one_prio_perm(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred = __task_cred(p); + + if (pcred->user->user_ns == cred->user->user_ns && + (pcred->uid == cred->euid || + pcred->euid == cred->euid)) + return true; + if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) + return true; + return false; +} + +/* * set the priority of a task * - the caller must hold the RCU read lock */ static int set_one_prio(struct task_struct *p, int niceval, int error) { - const struct cred *cred = current_cred(), *pcred = __task_cred(p); int no_nice; - if (pcred->uid != cred->euid && - pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { + if (!set_one_prio_perm(p)) { error = -EPERM; goto out; } @@ -506,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (rgid != (gid_t) -1) { if (old->gid == rgid || old->egid == rgid || - capable(CAP_SETGID)) + nsown_capable(CAP_SETGID)) new->gid = rgid; else goto error; @@ -515,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (old->gid == egid || old->egid == egid || old->sgid == egid || - capable(CAP_SETGID)) + nsown_capable(CAP_SETGID)) new->egid = egid; else goto error; @@ -550,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) old = current_cred(); retval = -EPERM; - if (capable(CAP_SETGID)) + if (nsown_capable(CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = gid; else if (gid == old->gid || gid == old->sgid) new->egid = new->fsgid = gid; @@ -617,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) new->uid = ruid; if (old->uid != ruid && old->euid != ruid && - !capable(CAP_SETUID)) + !nsown_capable(CAP_SETUID)) goto error; } @@ -626,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) if (old->uid != euid && old->euid != euid && old->suid != euid && - !capable(CAP_SETUID)) + !nsown_capable(CAP_SETUID)) goto error; } @@ -674,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) old = 
current_cred(); retval = -EPERM; - if (capable(CAP_SETUID)) { + if (nsown_capable(CAP_SETUID)) { new->suid = new->uid = uid; if (uid != old->uid) { retval = set_user(new); @@ -716,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) old = current_cred(); retval = -EPERM; - if (!capable(CAP_SETUID)) { + if (!nsown_capable(CAP_SETUID)) { if (ruid != (uid_t) -1 && ruid != old->uid && ruid != old->euid && ruid != old->suid) goto error; @@ -780,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) old = current_cred(); retval = -EPERM; - if (!capable(CAP_SETGID)) { + if (!nsown_capable(CAP_SETGID)) { if (rgid != (gid_t) -1 && rgid != old->gid && rgid != old->egid && rgid != old->sgid) goto error; @@ -840,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) if (uid == old->uid || uid == old->euid || uid == old->suid || uid == old->fsuid || - capable(CAP_SETUID)) { + nsown_capable(CAP_SETUID)) { if (uid != old_fsuid) { new->fsuid = uid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) @@ -873,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) if (gid == old->gid || gid == old->egid || gid == old->sgid || gid == old->fsgid || - capable(CAP_SETGID)) { + nsown_capable(CAP_SETGID)) { if (gid != old_fsgid) { new->fsgid = gid; goto change_okay; @@ -1181,8 +1198,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; down_write(&uts_sem); @@ -1230,7 +1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; @@ -1345,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, rlim = 
tsk->signal->rlim + resource; task_lock(tsk->group_leader); if (new_rlim) { + /* Keep the capable check against init_user_ns until + cgroups can contain all limits */ if (new_rlim->rlim_max > rlim->rlim_max && !capable(CAP_SYS_RESOURCE)) retval = -EPERM; @@ -1388,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task) { const struct cred *cred = current_cred(), *tcred; - tcred = __task_cred(task); - if (current != task && - (cred->uid != tcred->euid || - cred->uid != tcred->suid || - cred->uid != tcred->uid || - cred->gid != tcred->egid || - cred->gid != tcred->sgid || - cred->gid != tcred->gid) && - !capable(CAP_SYS_RESOURCE)) { - return -EPERM; - } + if (current == task) + return 0; - return 0; + tcred = __task_cred(task); + if (cred->user->user_ns == tcred->user->user_ns && + (cred->uid == tcred->euid && + cred->uid == tcred->suid && + cred->uid == tcred->uid && + cred->gid == tcred->egid && + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + return 0; + if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) + return 0; + + return -EPERM; } SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 40245d69..c0bb324 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -117,6 +117,7 @@ static int neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static unsigned long one_ul = 1; static int one_hundred = 100; #ifdef CONFIG_PRINTK @@ -169,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; @@ -706,7 +712,7 @@ static struct ctl_table 
kern_table[] = { .data = &kptr_restrict, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dmesg_restrict, .extra1 = &zero, .extra2 = &two, }, @@ -971,14 +977,18 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "panic_on_oom", .data = &sysctl_panic_on_oom, .maxlen = sizeof(sysctl_panic_on_oom), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "oom_kill_allocating_task", @@ -1006,7 +1016,8 @@ static struct ctl_table vm_table[] = { .data = &page_cluster, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "dirty_background_ratio", @@ -1054,7 +1065,8 @@ static struct ctl_table vm_table[] = { .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "nr_pdflush_threads", @@ -1130,6 +1142,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = drop_caches_sysctl_handler, + .extra1 = &one, + .extra2 = &three, }, #ifdef CONFIG_COMPACTION { @@ -2385,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write, return err; } +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + struct do_proc_dointvec_minmax_conv_param { int *min; int *max; diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 10b90d8..4e4932a 
100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) const char *fail = NULL; if (table->parent) { - if (table->procname && !table->parent->procname) + if (!table->parent->procname) set_fail(&fail, table, "Parent without procname"); } - if (!table->procname) - set_fail(&fail, table, "No procname"); if (table->child) { if (table->data) set_fail(&fail, table, "Directory with data?"); @@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) set_fail(&fail, table, "No maxlen"); } #ifdef CONFIG_PROC_SYSCTL - if (table->procname && !table->proc_handler) + if (!table->proc_handler) set_fail(&fail, table, "No proc_handler"); #endif -#if 0 - if (!table->procname && table->proc_handler) - set_fail(&fail, table, "proc_handler without procname"); -#endif sysctl_check_leaf(namespaces, table, &fail); } if (table->mode > 0777) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 3971c6b..9ffea36 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -685,7 +685,7 @@ static int __init taskstats_init(void) goto err_cgroup_ops; family_registered = 1; - printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); + pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); return 0; err_cgroup_ops: genl_unregister_ops(&family, &taskstats_ops); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index b2fa506..a470154 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -34,7 +34,7 @@ * inaccuracies caused by missed or lost timer * interrupts and the inability for the timer * interrupt hardware to accuratly tick at the - * requested HZ value. It is also not reccomended + * requested HZ value. It is also not recommended * for "tick-less" systems. 
*/ #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5f1bb8e..f6117a4 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -652,6 +652,8 @@ int do_adjtimex(struct timex *txc) struct timespec delta; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; + if (!capable(CAP_SYS_TIME)) + return -EPERM; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; result = timekeeping_inject_offset(&delta); diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 25028dd..c340ca6 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -19,7 +19,6 @@ */ #include <linux/device.h> #include <linux/file.h> -#include <linux/mutex.h> #include <linux/posix-clock.h> #include <linux/slab.h> #include <linux/syscalls.h> @@ -34,19 +33,19 @@ static struct posix_clock *get_posix_clock(struct file *fp) { struct posix_clock *clk = fp->private_data; - mutex_lock(&clk->mutex); + down_read(&clk->rwsem); if (!clk->zombie) return clk; - mutex_unlock(&clk->mutex); + up_read(&clk->rwsem); return NULL; } static void put_posix_clock(struct posix_clock *clk) { - mutex_unlock(&clk->mutex); + up_read(&clk->rwsem); } static ssize_t posix_clock_read(struct file *fp, char __user *buf, @@ -156,7 +155,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) struct posix_clock *clk = container_of(inode->i_cdev, struct posix_clock, cdev); - mutex_lock(&clk->mutex); + down_read(&clk->rwsem); if (clk->zombie) { err = -ENODEV; @@ -172,7 +171,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) fp->private_data = clk; } out: - mutex_unlock(&clk->mutex); + up_read(&clk->rwsem); return err; } @@ -211,25 +210,20 @@ int posix_clock_register(struct posix_clock *clk, dev_t devid) int err; kref_init(&clk->kref); - mutex_init(&clk->mutex); + init_rwsem(&clk->rwsem); cdev_init(&clk->cdev, &posix_clock_file_operations); clk->cdev.owner = clk->ops.owner; err = 
cdev_add(&clk->cdev, devid, 1); - if (err) - goto no_cdev; return err; -no_cdev: - mutex_destroy(&clk->mutex); - return err; } EXPORT_SYMBOL_GPL(posix_clock_register); static void delete_clock(struct kref *kref) { struct posix_clock *clk = container_of(kref, struct posix_clock, kref); - mutex_destroy(&clk->mutex); + if (clk->release) clk->release(clk); } @@ -238,9 +232,9 @@ void posix_clock_unregister(struct posix_clock *clk) { cdev_del(&clk->cdev); - mutex_lock(&clk->mutex); + down_write(&clk->rwsem); clk->zombie = true; - mutex_unlock(&clk->mutex); + up_write(&clk->rwsem); kref_put(&clk->kref, delete_clock); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3bd7e3d..8ad5d57 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -14,7 +14,7 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/sched.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/time.h> @@ -597,13 +597,12 @@ static struct timespec timekeeping_suspend_time; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev: unused * * This is for the generic clocksource timekeeping. * xtime/wall_to_monotonic/jiffies/etc are * still managed by arch specific suspend/resume code. 
*/ -static int timekeeping_resume(struct sys_device *dev) +static void timekeeping_resume(void) { unsigned long flags; struct timespec ts; @@ -632,11 +631,9 @@ static int timekeeping_resume(struct sys_device *dev) /* Resume hrtimers */ hres_timers_resume(); - - return 0; } -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +static int timekeeping_suspend(void) { unsigned long flags; @@ -654,26 +651,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) } /* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { - .name = "timekeeping", +static struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_resume, .suspend = timekeeping_suspend, }; -static struct sys_device device_timer = { - .id = 0, - .cls = &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) +static int __init timekeeping_init_ops(void) { - int error = sysdev_class_register(&timekeeping_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; + register_syscore_ops(&timekeeping_syscore_ops); + return 0; } -device_initcall(timekeeping_init_device); +device_initcall(timekeeping_init_ops); /* * If the error is already larger, we look ahead even further diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 2f3b585..a5d0a3a 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, unsigned int timer_flag) { /* - * It doesnt matter which lock we take: + * It doesn't matter which lock we take: */ raw_spinlock_t *lock; struct entry *entry, input; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index cbafed7..6957aa2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q) * **/ static void blk_add_trace_rq(struct request_queue *q, struct 
request *rq, - u32 what) + u32 what) { struct blk_trace *bt = q->blk_trace; - int rw = rq->cmd_flags & 0x03; if (likely(!bt)) return; - if (rq->cmd_flags & REQ_DISCARD) - rw |= REQ_DISCARD; - - if (rq->cmd_flags & REQ_SECURE) - rw |= REQ_SECURE; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, + __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, - what, rq->errors, 0, NULL); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq->cmd_flags, what, rq->errors, 0, NULL); } } @@ -857,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } -static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) +static void blk_add_trace_unplug(void *ignore, struct request_queue *q, + unsigned int depth, bool explicit) { struct blk_trace *bt = q->blk_trace; if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, - sizeof(rpdu), &rpdu); - } -} + __be64 rpdu = cpu_to_be64(depth); + u32 what; -static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); + if (explicit) + what = BLK_TA_UNPLUG_IO; + else + what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, - sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); } } @@ -1022,9 +1007,7 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); - 
WARN_ON(ret); - ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); + ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); @@ -1039,8 +1022,7 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); - unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); + unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 888b611..ee24fa1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1268,7 +1268,7 @@ static int ftrace_update_code(struct module *mod) p->flags = 0L; /* - * Do the initial record convertion from mcount jump + * Do the initial record conversion from mcount jump * to the NOP instructions. 
*/ if (!ftrace_code_disable(mod, p)) { @@ -1467,7 +1467,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) return t_hash_next(m, pos); (*pos)++; - iter->pos = *pos; + iter->pos = iter->func_pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) return t_hash_start(m, pos); @@ -1502,7 +1502,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (!rec) return t_hash_start(m, pos); - iter->func_pos = *pos; iter->func = rec; return iter; @@ -3426,7 +3425,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; - /* make curr_ret_stack visable before we add the ret_stack */ + /* make curr_ret_stack visible before we add the ret_stack */ smp_wmb(); t->ret_stack = ret_stack; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d9c8bca..0ef7b4b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1478,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage) return local_read(&bpage->entries) & RB_WRITE_MASK; } -/* Size is determined by what has been commited */ +/* Size is determined by what has been committed */ static inline unsigned rb_page_size(struct buffer_page *bpage) { return rb_page_commit(bpage); @@ -2932,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* * cpu_buffer->pages just needs to point to the buffer, it * has no specific buffer page to point to. Lets move it out - * of our way so we don't accidently swap it. + * of our way so we don't accidentally swap it. 
*/ cpu_buffer->pages = reader->list.prev; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9541c27..d38c16a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3239,7 +3239,7 @@ waitagain: trace_seq_init(&iter->seq); /* - * If there was nothing to send to user, inspite of consuming trace + * If there was nothing to send to user, in spite of consuming trace * entries, go back to wait for more entries. */ if (sret == -EBUSY) diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 685a67d..6302747 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void) } /* - * trace_clock(): 'inbetween' trace clock. Not completely serialized, + * trace_clock(): 'between' trace clock. Not completely serialized, * but not completely incorrect when crossing CPUs either. * * This is based on cpu_clock(), which will allow at most ~1 jiffy of diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 1516cb3..e32744c 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -27,7 +27,7 @@ * in the structure. * * * for structures within structures, the format of the internal - * structure is layed out. This allows the internal structure + * structure is laid out. This allows the internal structure * to be deciphered for the format file. Although these macros * may become out of sync with the internal structure, they * will create a compile error if it happens. 
Since the diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 76b0598..962cdb2 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, * * returns 1 if * - we are inside irq code - * - we just extered irq code + * - we just entered irq code * * retunns 0 if * - funcgraph-interrupts option is set diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 92b6e1e..a4969b4 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = { * skip the latency if the sequence has changed - some other section * did a maximum and could disturb our measurement with serial console * printouts, etc. Truly coinciding maximum latencies should be rare - * and what happens together happens separately as well, so this doesnt + * and what happens together happens separately as well, so this doesn't * decrease the validity of the maximum found: */ static __cacheline_aligned_in_smp unsigned long max_sequence; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8435b43..35d55a3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1839,7 +1839,7 @@ static void unregister_probe_event(struct trace_probe *tp) kfree(tp->call.print_fmt); } -/* Make a debugfs interface for controling probe points */ +/* Make a debugfs interface for controlling probe points */ static __init int init_kprobe_trace(void) { struct dentry *d_tracer; diff --git a/kernel/uid16.c b/kernel/uid16.c index 4192098..51c6e89 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!capable(CAP_SETGID)) + if (!nsown_capable(CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > 
NGROUPS_MAX) return -EINVAL; diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index eb27fd3..92cb706 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); /* * Removes a registered user return notifier. Must be called from atomic - * context, and from the same cpu registration occured in. + * context, and from the same cpu registration occurred in. */ void user_return_notifier_unregister(struct user_return_notifier *urn) { diff --git a/kernel/user.c b/kernel/user.c index 5c598ca..9e03e9c 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -17,9 +17,13 @@ #include <linux/module.h> #include <linux/user_namespace.h> +/* + * userns count is 1 for root user, 1 for init_uts_ns, + * and 1 for... ? + */ struct user_namespace init_user_ns = { .kref = { - .refcount = ATOMIC_INIT(2), + .refcount = ATOMIC_INIT(3), }, .creator = &root_user, }; @@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep; */ static DEFINE_SPINLOCK(uidhash_lock); -/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ +/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ struct user_struct root_user = { .__count = ATOMIC_INIT(2), .processes = ATOMIC_INIT(1), diff --git a/kernel/utsname.c b/kernel/utsname.c index 8a82b4b..4464617 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -14,6 +14,7 @@ #include <linux/utsname.h> #include <linux/err.h> #include <linux/slab.h> +#include <linux/user_namespace.h> static struct uts_namespace *create_uts_ns(void) { @@ -30,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void) * @old_ns: namespace to clone * Return NULL on error (failure to kmalloc), new ns otherwise */ -static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) +static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, + struct uts_namespace *old_ns) { struct uts_namespace *ns; @@ -40,6 +42,7 @@ static 
struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); up_read(&uts_sem); return ns; } @@ -50,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) +struct uts_namespace *copy_utsname(unsigned long flags, + struct task_struct *tsk) { + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; BUG_ON(!old_ns); @@ -60,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol if (!(flags & CLONE_NEWUTS)) return old_ns; - new_ns = clone_uts_ns(old_ns); + new_ns = clone_uts_ns(tsk, old_ns); put_uts_ns(old_ns); return new_ns; @@ -71,5 +76,6 @@ void free_uts_ns(struct kref *kref) struct uts_namespace *ns; ns = container_of(kref, struct uts_namespace, kref); + put_user_ns(ns->user_ns); kfree(ns); } diff --git a/kernel/wait.c b/kernel/wait.c index b0310eb..f45ea8d 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait); * woken up through the queue. * * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and noone wakes up + * aborts and is woken up concurrently and no one wakes up * the next waiter. 
*/ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18bb157..140dce7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); * Should we panic when a soft-lockup or hard-lockup occurs: */ #ifdef CONFIG_HARDLOCKUP_DETECTOR -static int hardlockup_panic; +static int hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static int __init hardlockup_panic_setup(char *str) { if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) watchdog_enabled = 0; return 1; @@ -415,19 +418,22 @@ static int watchdog_prepare_cpu(int cpu) static int watchdog_enable(int cpu) { struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - int err; + int err = 0; /* enable the perf event */ err = watchdog_nmi_enable(cpu); - if (err) - return err; + + /* Regardless of err above, fall through and start softlockup */ /* create the watchdog thread */ if (!p) { p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); - return PTR_ERR(p); + if (!err) + /* if hardlockup hasn't already set this */ + err = PTR_ERR(p); + goto out; } kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; @@ -435,7 +441,8 @@ static int watchdog_enable(int cpu) wake_up_process(p); } - return 0; +out: + return err; } static void watchdog_disable(int cpu) @@ -547,7 +554,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif /* CONFIG_HOTPLUG_CPU */ } - return notifier_from_errno(err); + + /* + * hardlockup and softlockup are not important enough + * to block cpu bring up. Just always succeed and + * rely on printk output to flag problems. 
+ */ + return NOTIFY_OK; } static struct notifier_block __cpuinitdata cpu_nfb = { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5ca7ce9..8859a41 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1291,7 +1291,7 @@ __acquires(&gcwq->lock) return true; spin_unlock_irq(&gcwq->lock); - /* CPU has come up inbetween, retry migration */ + /* CPU has come up in between, retry migration */ cpu_relax(); } } @@ -1366,8 +1366,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) worker->id = id; if (!on_unbound_cpu) - worker->task = kthread_create(worker_thread, worker, - "kworker/%u:%d", gcwq->cpu, id); + worker->task = kthread_create_on_node(worker_thread, + worker, + cpu_to_node(gcwq->cpu), + "kworker/%u:%d", gcwq->cpu, id); else worker->task = kthread_create(worker_thread, worker, "kworker/u:%d", id); |