diff options
author | Jiri Kosina <jkosina@suse.cz> | 2011-04-26 10:22:15 +0200 |
---|---|---|
committer | Jiri Kosina <jkosina@suse.cz> | 2011-04-26 10:22:59 +0200 |
commit | 07f9479a40cc778bc1462ada11f95b01360ae4ff (patch) | |
tree | 0676cf38df3844004bb3ebfd99dfa67a4a8998f5 /kernel | |
parent | 9d5e6bdb3013acfb311ab407eeca0b6a6a3dedbf (diff) | |
parent | cd2e49e90f1cae7726c9a2c54488d881d7f1cd1c (diff) | |
download | op-kernel-dev-07f9479a40cc778bc1462ada11f95b01360ae4ff.zip op-kernel-dev-07f9479a40cc778bc1462ada11f95b01360ae4ff.tar.gz |
Merge branch 'master' into for-next
Fast-forwarded to current state of Linus' tree as there are patches to be
applied for files that didn't exist on the old branch.
Diffstat (limited to 'kernel')
96 files changed, 1247 insertions, 911 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 353d3fe..85cbfb3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 37b2bea..e99dda0 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -607,7 +607,7 @@ void audit_trim_trees(void) spin_lock(&hash_lock); list_for_each_entry(node, &tree->chunks, list) { struct audit_chunk *chunk = find_chunk(node); - /* this could be NULL if the watch is dieing else where... */ + /* this could be NULL if the watch is dying else where... */ struct inode *inode = chunk->mark.i.inode; node->index |= 1U<<31; if (iterate_mounts(compare_root, inode, root_mnt)) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f49a031..b33513a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1011,7 +1011,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, /* * to_send and len_sent accounting are very loose estimates. We aren't * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundry) + * within about 500 bytes (next page boundary) * * why snprintf? an int is up to 12 digits long. if we just assumed when * logging that a[%d]= was going to be 16 characters long we would be wasting diff --git a/kernel/bounds.c b/kernel/bounds.c index 98a51f2..0c9b862 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,11 +9,13 @@ #include <linux/page-flags.h> #include <linux/mmzone.h> #include <linux/kbuild.h> +#include <linux/page_cgroup.h> void foo(void) { /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); + DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); /* End of constants */ } diff --git a/kernel/capability.c b/kernel/capability.c index 9e9385f..bf0c734 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -14,6 +14,7 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/pid_namespace.h> +#include <linux/user_namespace.h> #include <asm/uaccess.h> /* @@ -290,6 +291,60 @@ error: } /** + * has_capability - Does a task have a capability in init_user_ns + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to the initial user namespace, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability(struct task_struct *t, int cap) +{ + int ret = security_real_capable(t, &init_user_ns, cap); + + return (ret == 0); +} + +/** + * has_capability - Does a task have a capability in a specific user ns + * @t: The task in question + * @ns: target user namespace + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to the specified user namespace, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap) +{ + int ret = security_real_capable(t, ns, cap); + + return (ret == 0); +} + +/** + * has_capability_noaudit - Does a task have a capability (unaudited) + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to init_user_ns, false if not. Don't write an + * audit message for the check. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability_noaudit(struct task_struct *t, int cap) +{ + int ret = security_real_capable_noaudit(t, &init_user_ns, cap); + + return (ret == 0); +} + +/** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * @@ -299,17 +354,48 @@ error: * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ -int capable(int cap) +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); + +/** + * ns_capable - Determine if the current task has a superior capability in effect + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool ns_capable(struct user_namespace *ns, int cap) { if (unlikely(!cap_valid(cap))) { printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); BUG(); } - if (security_capable(current_cred(), cap) == 0) { + if (security_capable(ns, current_cred(), cap) == 0) { current->flags |= PF_SUPERPRIV; - return 1; + return true; } - return 0; + return false; } -EXPORT_SYMBOL(capable); +EXPORT_SYMBOL(ns_capable); + +/** + * task_ns_capable - Determine whether current task has a superior + * capability targeted at a specific task's user namespace. + * @t: The task whose user namespace is targeted. + * @cap: The capability in question. + * + * Return true if it does, false otherwise. + */ +bool task_ns_capable(struct task_struct *t, int cap) +{ + return ns_capable(task_cred_xxx(t, user)->user_ns, cap); +} +EXPORT_SYMBOL(task_ns_capable); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 95362d1..25c7eb5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -157,7 +157,7 @@ struct css_id { }; /* - * cgroup_event represents events which userspace want to recieve. + * cgroup_event represents events which userspace want to receive. */ struct cgroup_event { /* @@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) /* Update the css_set linked lists if we're using them */ write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - list_del(&tsk->cg_list); - list_add(&tsk->cg_list, &newcg->tasks); - } + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); write_unlock(&css_set_lock); for_each_subsys(root, ss) { @@ -3655,12 +3653,12 @@ again: spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) - list_del(&cgrp->release_list); + list_del_init(&cgrp->release_list); spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ - list_del(&cgrp->sibling); + list_del_init(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); d = dget(cgrp->dentry); @@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) subsys[ss->subsys_id] = NULL; /* remove subsystem from rootnode's list of subsystems */ - list_del(&ss->sibling); + list_del_init(&ss->sibling); /* * disentangle the css from all css_sets attached to the dummytop. as @@ -4241,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) if (!list_empty(&tsk->cg_list)) { write_lock(&css_set_lock); if (!list_empty(&tsk->cg_list)) - list_del(&tsk->cg_list); + list_del_init(&tsk->cg_list); write_unlock(&css_set_lock); } diff --git a/kernel/cpu.c b/kernel/cpu.c index 156cc55..12b7458 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -126,7 +126,7 @@ static void cpu_hotplug_done(void) #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} -#endif /* #esle #if CONFIG_HOTPLUG_CPU */ +#endif /* #else #if CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ int __ref register_cpu_notifier(struct notifier_block *nb) @@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v) { BUG_ON(cpu_notify(val, v)); } - EXPORT_SYMBOL(register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) @@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param) return err; cpu_notify(CPU_DYING | param->mod, param->hcpu); - return 0; } @@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { nr_calls--; @@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - printk("%s: attempt to bring up CPU %u failed\n", + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", __func__, cpu); goto out_notify; } @@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - printk("Enabling non-boot CPUs ...\n"); + printk(KERN_INFO "Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { - printk("CPU%d is up\n", cpu); + printk(KERN_INFO "CPU%d is up\n", cpu); continue; } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); @@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) */ /* cpu_bit_bitmap[0] is empty - so we can back into it */ -#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) +#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x)) #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e92e981..33eee16 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); - - if (!newmems) - return; + static nodemask_t newmems; /* protected by cgroup_mutex */ cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, newmems); - - cpuset_change_task_nodemask(p, newmems); + guarantee_online_mems(cs, &newmems); - NODEMASK_FREE(newmems); + cpuset_change_task_nodemask(p, &newmems); mm = get_task_mm(p); if (!mm) @@ -1438,44 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); - NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); - - if (from == NULL || to == NULL) - goto alloc_fail; + static nodemask_t to; /* protected by cgroup_mutex */ if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); } else { guarantee_online_cpus(cs, cpus_attach); } - guarantee_online_mems(cs, to); + guarantee_online_mems(cs, &to); /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, to, cs); + cpuset_attach_task(tsk, &to, cs); if (threadgroup) { struct task_struct *c; rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, to, cs); + cpuset_attach_task(c, &to, cs); } rcu_read_unlock(); } /* change mm; only needs to be done once even if threadgroup */ - *from = oldcs->mems_allowed; - *to = cs->mems_allowed; + to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, to); + mpol_rebind_mm(mm, &to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, from, to); + cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to); mmput(mm); } - -alloc_fail: - NODEMASK_FREE(from); - NODEMASK_FREE(to); } /* The various types of files and directories in a cpuset file system */ @@ -1610,34 +1596,26 @@ out: * across a page fault. */ -static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) +static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { - int ret; + size_t count; mutex_lock(&callback_mutex); - ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); + count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); mutex_unlock(&callback_mutex); - return ret; + return count; } -static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) +static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) { - NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); - int retval; - - if (mask == NULL) - return -ENOMEM; + size_t count; mutex_lock(&callback_mutex); - *mask = cs->mems_allowed; + count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); mutex_unlock(&callback_mutex); - retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); - - NODEMASK_FREE(mask); - - return retval; + return count; } static ssize_t cpuset_common_file_read(struct cgroup *cont, @@ -1862,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, cs = cgroup_cs(cgroup); parent_cs = cgroup_cs(parent); + mutex_lock(&callback_mutex); cs->mems_allowed = parent_cs->mems_allowed; cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); + mutex_unlock(&callback_mutex); return; } @@ -2066,10 +2046,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ struct cgroup *cont; - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); - - if (oldmems == NULL) - return; + static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); @@ -2086,7 +2063,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; - *oldmems = cp->mems_allowed; + oldmems = cp->mems_allowed; /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); @@ -2102,10 +2079,9 @@ static void scan_for_empty_cpusets(struct cpuset *root) remove_tasks_in_empty_cpuset(cp); else { update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, oldmems, NULL); + update_tasks_nodemask(cp, &oldmems, NULL); } } - NODEMASK_FREE(oldmems); } /* @@ -2147,19 +2123,16 @@ void cpuset_update_active_cpus(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); - - if (oldmems == NULL) - return NOTIFY_DONE; + static nodemask_t oldmems; /* protected by cgroup_mutex */ cgroup_lock(); switch (action) { case MEM_ONLINE: - *oldmems = top_cpuset.mems_allowed; + oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, oldmems, NULL); + update_tasks_nodemask(&top_cpuset, &oldmems, NULL); break; case MEM_OFFLINE: /* @@ -2173,7 +2146,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self, } cgroup_unlock(); - NODEMASK_FREE(oldmems); return NOTIFY_OK; } #endif diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c new file mode 100644 index 0000000..5f85690 --- /dev/null +++ b/kernel/crash_dump.c @@ -0,0 +1,34 @@ +#include <linux/kernel.h> +#include <linux/crash_dump.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/module.h> + +/* + * If we have booted due to a crash, max_pfn will be a very low value. We need + * to know the amount of memory that the previous kernel used. + */ +unsigned long saved_max_pfn; + +/* + * stores the physical address of elf header of crash image + * + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence put + * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + +/* + * elfcorehdr= specifies the location of elf core header stored by the crashed + * kernel. This option will be passed by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) +{ + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; +} +early_param("elfcorehdr", setup_elfcorehdr); diff --git a/kernel/cred.c b/kernel/cred.c index 2343c132..5557b55 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode) } EXPORT_SYMBOL(set_create_files_as); +struct user_namespace *current_user_ns(void) +{ + return _current_user_ns(); +} +EXPORT_SYMBOL(current_user_ns); + #ifdef CONFIG_DEBUG_CREDENTIALS bool creds_are_invalid(const struct cred *cred) diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index cefd4a1..bad6786 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -538,7 +538,7 @@ return_normal: /* * For single stepping, try to only enter on the processor - * that was single stepping. To gaurd against a deadlock, the + * that was single stepping. To guard against a deadlock, the * kernel will only try for the value of sstep_tries before * giving up and continuing on. */ diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 481a7bd..a11db95 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) put_packet(remcom_out_buffer); return 0; } + +/** + * gdbstub_exit - Send an exit message to GDB + * @status: The exit code to report. + */ +void gdbstub_exit(int status) +{ + unsigned char checksum, ch, buffer[3]; + int loop; + + buffer[0] = 'W'; + buffer[1] = hex_asc_hi(status); + buffer[2] = hex_asc_lo(status); + + dbg_io_ops->write_char('$'); + checksum = 0; + + for (loop = 0; loop < 3; loop++) { + ch = buffer[loop]; + checksum += ch; + dbg_io_ops->write_char(ch); + } + + dbg_io_ops->write_char('#'); + dbg_io_ops->write_char(hex_asc_hi(checksum)); + dbg_io_ops->write_char(hex_asc_lo(checksum)); + + /* make sure the output is flushed, lest the bootloader clobber it */ + dbg_io_ops->flush(); +} diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index bd3e8e2..be14779 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -78,7 +78,7 @@ static unsigned int kdb_continue_catastrophic; static kdbtab_t *kdb_commands; #define KDB_BASE_CMD_MAX 50 static int kdb_max_commands = KDB_BASE_CMD_MAX; -static kdbtab_t kdb_base_commands[50]; +static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX]; #define for_each_kdbcmd(cmd, num) \ for ((cmd) = kdb_base_commands, (num) = 0; \ num < kdb_max_commands; \ @@ -441,9 +441,9 @@ static int kdb_check_regs(void) * symbol name, and offset to the caller. * * The argument may consist of a numeric value (decimal or - * hexidecimal), a symbol name, a register name (preceeded by the + * hexidecimal), a symbol name, a register name (preceded by the * percent sign), an environment variable with a numeric value - * (preceeded by a dollar sign) or a simple arithmetic expression + * (preceded by a dollar sign) or a simple arithmetic expression * consisting of a symbol name, +/-, and a numeric constant value * (offset). * Parameters: @@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value) * error The hardware-defined error code * reason2 kdb's current reason code. * Initially error but can change - * acording to kdb state. + * according to kdb state. * db_result Result code from break or debug point. * regs The exception frame at time of fault/breakpoint. * should always be valid. @@ -2892,7 +2892,7 @@ static void __init kdb_inittab(void) "Send a signal to a process", 0, KDB_REPEAT_NONE); kdb_register_repeat("summary", kdb_summary, "", "Summarize the system", 4, KDB_REPEAT_NONE); - kdb_register_repeat("per_cpu", kdb_per_cpu, "", + kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", "Display per_cpu variables", 3, KDB_REPEAT_NONE); kdb_register_repeat("grephelp", kdb_grep_help, "", "Display help on | grep", 0, KDB_REPEAT_NONE); diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 6b2485d..5532dd3 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) * Mask for process state. * Notes: * The mask folds data from several sources into a single long value, so - * be carefull not to overlap the bits. TASK_* bits are in the LSB, + * be careful not to overlap the bits. TASK_* bits are in the LSB, * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there * is no overlap between TASK_* and EXIT_* but that may not always be * true, so EXIT_* bits are shifted left 16 bits before being stored in diff --git a/kernel/exit.c b/kernel/exit.c index f9a45eb..f5d2f63 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* Let father know we died * * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitary processes. + * that to send signals to arbitrary processes. * That stops right now. * * If the parent exec id doesn't match the exec id we saved @@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code) profile_task_exit(tsk); WARN_ON(atomic_read(&tsk->fs_excl)); + WARN_ON(blk_needs_flush_plug(tsk)); if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); diff --git a/kernel/fork.c b/kernel/fork.c index 05b92c4..e7548de 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include <linux/tracehook.h> #include <linux/futex.h> #include <linux/compat.h> +#include <linux/kthread.h> #include <linux/task_io_accounting_ops.h> #include <linux/rcupdate.h> #include <linux/ptrace.h> @@ -109,20 +110,25 @@ int nr_processes(void) } #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) -# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) +# define alloc_task_struct_node(node) \ + kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) +# define free_task_struct(tsk) \ + kmem_cache_free(task_struct_cachep, (tsk)) static struct kmem_cache *task_struct_cachep; #endif #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR -static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) +static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, + int node) { #ifdef CONFIG_DEBUG_STACK_USAGE gfp_t mask = GFP_KERNEL | __GFP_ZERO; #else gfp_t mask = GFP_KERNEL; #endif - return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); + + return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) @@ -249,16 +255,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - + int node = tsk_fork_get_node(orig); int err; prepare_to_copy(orig); - tsk = alloc_task_struct(); + tsk = alloc_task_struct_node(node); if (!tsk) return NULL; - ti = alloc_thread_info(tsk); + ti = alloc_thread_info_node(tsk, node); if (!ti) { free_task_struct(tsk); return NULL; @@ -1181,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; - - if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); - if (retval < 0) - goto bad_fork_free_pid; - } } p->pid = pid_nr(pid); @@ -1205,6 +1205,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; +#ifdef CONFIG_BLOCK + p->plug = NULL; +#endif #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT @@ -1290,7 +1293,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { - if (clone_flags & CLONE_NEWPID) + if (is_child_reaper(pid)) p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; @@ -1513,38 +1516,24 @@ void __init proc_caches_init(void) } /* - * Check constraints on flags passed to the unshare system call and - * force unsharing of additional process context as appropriate. + * Check constraints on flags passed to the unshare system call. */ -static void check_unshare_flags(unsigned long *flags_ptr) +static int check_unshare_flags(unsigned long unshare_flags) { + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + return -EINVAL; /* - * If unsharing a thread from a thread group, must also - * unshare vm. - */ - if (*flags_ptr & CLONE_THREAD) - *flags_ptr |= CLONE_VM; - - /* - * If unsharing vm, must also unshare signal handlers. - */ - if (*flags_ptr & CLONE_VM) - *flags_ptr |= CLONE_SIGHAND; - - /* - * If unsharing namespace, must also unshare filesystem information. + * Not implemented, but pretend it works if there is nothing to + * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND + * needs to unshare vm. */ - if (*flags_ptr & CLONE_NEWNS) - *flags_ptr |= CLONE_FS; -} - -/* - * Unsharing of tasks created with CLONE_THREAD is not supported yet - */ -static int unshare_thread(unsigned long unshare_flags) -{ - if (unshare_flags & CLONE_THREAD) - return -EINVAL; + if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { + /* FIXME: get_task_mm() increments ->mm_users */ + if (atomic_read(¤t->mm->mm_users) > 1) + return -EINVAL; + } return 0; } @@ -1571,34 +1560,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) } /* - * Unsharing of sighand is not supported yet - */ -static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) -{ - struct sighand_struct *sigh = current->sighand; - - if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) - return -EINVAL; - else - return 0; -} - -/* - * Unshare vm if it is being shared - */ -static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) -{ - struct mm_struct *mm = current->mm; - - if ((unshare_flags & CLONE_VM) && - (mm && atomic_read(&mm->mm_users) > 1)) { - return -EINVAL; - } - - return 0; -} - -/* * Unshare file descriptor table if it is being shared */ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) @@ -1626,45 +1587,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp */ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { - int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct sighand_struct *new_sigh = NULL; - struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; + int err; - check_unshare_flags(&unshare_flags); - - /* Return -EINVAL for all unsupported flags */ - err = -EINVAL; - if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + err = check_unshare_flags(unshare_flags); + if (err) goto bad_unshare_out; /* + * If unsharing namespace, must also unshare filesystem information. + */ + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old * namespace are unreachable. */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; - if ((err = unshare_thread(unshare_flags))) - goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) - goto bad_unshare_cleanup_thread; - if ((err = unshare_sighand(unshare_flags, &new_sigh))) - goto bad_unshare_cleanup_fs; - if ((err = unshare_vm(unshare_flags, &new_mm))) - goto bad_unshare_cleanup_sigh; + goto bad_unshare_out; if ((err = unshare_fd(unshare_flags, &new_fd))) - goto bad_unshare_cleanup_vm; + goto bad_unshare_cleanup_fs; if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) goto bad_unshare_cleanup_fd; - if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1690,19 +1643,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) spin_unlock(&fs->lock); } - if (new_mm) { - mm = current->mm; - active_mm = current->active_mm; - current->mm = new_mm; - current->active_mm = new_mm; - if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&mm->oom_disable_count); - atomic_inc(&new_mm->oom_disable_count); - } - activate_mm(active_mm, new_mm); - new_mm = mm; - } - if (new_fd) { fd = current->files; current->files = new_fd; @@ -1719,20 +1659,10 @@ bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); -bad_unshare_cleanup_vm: - if (new_mm) - mmput(new_mm); - -bad_unshare_cleanup_sigh: - if (new_sigh) - if (atomic_dec_and_test(&new_sigh->count)) - kmem_cache_free(sighand_cachep, new_sigh); - bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); -bad_unshare_cleanup_thread: bad_unshare_out: return err; } diff --git a/kernel/futex.c b/kernel/futex.c index bda4157..fe28dc2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -782,8 +782,8 @@ static void __unqueue_futex(struct futex_q *q) { struct futex_hash_bucket *hb; - if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr) - || plist_node_empty(&q->list))) + if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) + || WARN_ON(plist_node_empty(&q->list))) return; hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); @@ -1886,7 +1886,7 @@ retry: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = flags; + restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; ret = -ERESTART_RESTARTBLOCK; @@ -2418,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, goto err_unlock; ret = -EPERM; pcred = __task_cred(p); + /* If victim is in different user_ns, then uids are not + comparable, so we must have CAP_SYS_PTRACE */ + if (cred->user->user_ns != pcred->user->user_ns) { + if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) + goto err_unlock; + goto ok; + } + /* If victim is in same user_ns, then uids are comparable */ if (cred->euid != pcred->euid && cred->euid != pcred->uid && - !capable(CAP_SYS_PTRACE)) + !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) goto err_unlock; +ok: head = p->robust_list; rcu_read_unlock(); } diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index a7934ac..5f9e689 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, goto err_unlock; ret = -EPERM; pcred = __task_cred(p); + /* If victim is in different user_ns, then uids are not + comparable, so we must have CAP_SYS_PTRACE */ + if (cred->user->user_ns != pcred->user->user_ns) { + if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) + goto err_unlock; + goto ok; + } + /* If victim is in same user_ns, then uids are comparable */ if (cred->euid != pcred->euid && cred->euid != pcred->uid && - !capable(CAP_SYS_PTRACE)) + !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) goto err_unlock; +ok: head = p->compat_robust_list; rcu_read_unlock(); } diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 3f76100..e97ca59 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,3 +1,3 @@ -EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' +ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o diff --git a/kernel/groups.c b/kernel/groups.c index 253dc0f..1cc476d 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!capable(CAP_SETGID)) + if (!nsown_capable(CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 09bef82..c574f9a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -10,13 +10,6 @@ menu "IRQ subsystem" config GENERIC_HARDIRQS def_bool y -# Select this to disable the deprecated stuff -config GENERIC_HARDIRQS_NO_DEPRECATED - bool - -config GENERIC_HARDIRQS_NO_COMPAT - bool - # Options selectable by the architecture code # Make sparse irq Kconfig switch below available @@ -31,6 +24,10 @@ config GENERIC_IRQ_PROBE config GENERIC_IRQ_SHOW bool +# Print level/edge extra information +config GENERIC_IRQ_SHOW_LEVEL + bool + # Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool @@ -47,6 +44,10 @@ config HARDIRQS_SW_RESEND config IRQ_PREFLOW_FASTEOI bool +# Edge style eoi based handler (cell) +config IRQ_EDGE_EOI_HANDLER + bool + # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 394784c..342d8f4 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -70,10 +70,8 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc)) { - irq_compat_set_pending(desc); + if (irq_startup(desc)) desc->istate |= IRQS_PENDING; - } } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c9c0601..4af1e2b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -34,9 +34,14 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - irq_chip_set_defaults(chip); desc->irq_data.chip = chip; irq_put_desc_unlock(desc, flags); + /* + * For !CONFIG_SPARSE_IRQ make the irq show up in + * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is + * already marked, and this call is harmless. + */ + irq_reserve_irq(irq); return 0; } EXPORT_SYMBOL(irq_set_chip); @@ -134,26 +139,22 @@ EXPORT_SYMBOL_GPL(irq_get_irq_data); static void irq_state_clr_disabled(struct irq_desc *desc) { - desc->istate &= ~IRQS_DISABLED; - irq_compat_clr_disabled(desc); + irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); } static void irq_state_set_disabled(struct irq_desc *desc) { - desc->istate |= IRQS_DISABLED; - irq_compat_set_disabled(desc); + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); } static void irq_state_clr_masked(struct irq_desc *desc) { - desc->istate &= ~IRQS_MASKED; - irq_compat_clr_masked(desc); + irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } static void irq_state_set_masked(struct irq_desc *desc) { - desc->istate |= IRQS_MASKED; - irq_compat_set_masked(desc); + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); } int irq_startup(struct irq_desc *desc) @@ -203,126 +204,6 @@ void irq_disable(struct irq_desc *desc) } } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -/* Temporary migration helpers */ -static void compat_irq_mask(struct irq_data *data) -{ - data->chip->mask(data->irq); -} - -static void compat_irq_unmask(struct irq_data *data) -{ - data->chip->unmask(data->irq); -} - -static void compat_irq_ack(struct irq_data *data) -{ - data->chip->ack(data->irq); -} - -static void compat_irq_mask_ack(struct irq_data *data) -{ - data->chip->mask_ack(data->irq); -} - -static void compat_irq_eoi(struct irq_data *data) -{ - data->chip->eoi(data->irq); -} - -static void compat_irq_enable(struct irq_data *data) -{ - data->chip->enable(data->irq); -} - -static void compat_irq_disable(struct irq_data *data) -{ - data->chip->disable(data->irq); -} - -static void compat_irq_shutdown(struct irq_data *data) -{ - data->chip->shutdown(data->irq); -} - -static unsigned int compat_irq_startup(struct irq_data *data) -{ - return data->chip->startup(data->irq); -} - -static int compat_irq_set_affinity(struct irq_data *data, - const struct cpumask *dest, bool force) -{ - return data->chip->set_affinity(data->irq, dest); -} - -static int compat_irq_set_type(struct irq_data *data, unsigned int type) -{ - return data->chip->set_type(data->irq, type); -} - -static int compat_irq_set_wake(struct irq_data *data, unsigned int on) -{ - return data->chip->set_wake(data->irq, on); -} - -static int compat_irq_retrigger(struct irq_data *data) -{ - return data->chip->retrigger(data->irq); -} - -static void compat_bus_lock(struct irq_data *data) -{ - data->chip->bus_lock(data->irq); -} - -static void compat_bus_sync_unlock(struct irq_data *data) -{ - data->chip->bus_sync_unlock(data->irq); -} -#endif - -/* - * Fixup enable/disable function pointers - */ -void irq_chip_set_defaults(struct irq_chip *chip) -{ -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - if (chip->enable) - chip->irq_enable = compat_irq_enable; - if (chip->disable) - chip->irq_disable = compat_irq_disable; - if (chip->shutdown) - chip->irq_shutdown = compat_irq_shutdown; - if (chip->startup) - chip->irq_startup = compat_irq_startup; - if (!chip->end) - chip->end = dummy_irq_chip.end; - if (chip->bus_lock) - chip->irq_bus_lock = compat_bus_lock; - if (chip->bus_sync_unlock) - chip->irq_bus_sync_unlock = compat_bus_sync_unlock; - if (chip->mask) - chip->irq_mask = compat_irq_mask; - if (chip->unmask) - chip->irq_unmask = compat_irq_unmask; - if (chip->ack) - chip->irq_ack = compat_irq_ack; - if (chip->mask_ack) - chip->irq_mask_ack = compat_irq_mask_ack; - if (chip->eoi) - chip->irq_eoi = compat_irq_eoi; - if (chip->set_affinity) - chip->irq_set_affinity = compat_irq_set_affinity; - if (chip->set_type) - chip->irq_set_type = compat_irq_set_type; - if (chip->set_wake) - chip->irq_set_wake = compat_irq_set_wake; - if (chip->retrigger) - chip->irq_retrigger = compat_irq_retrigger; -#endif -} - static inline void mask_ack_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask_ack) @@ -372,11 +253,10 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; - irq_compat_set_progress(desc); - desc->istate |= IRQS_INPROGRESS; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); @@ -384,8 +264,7 @@ void handle_nested_irq(unsigned int irq) note_interrupt(irq, desc, action_ret); raw_spin_lock_irq(&desc->lock); - desc->istate &= ~IRQS_INPROGRESS; - irq_compat_clr_progress(desc); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); out_unlock: raw_spin_unlock_irq(&desc->lock); @@ -416,14 +295,14 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; handle_irq_event(desc); @@ -448,7 +327,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out_unlock; @@ -459,12 +338,12 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * keep it masked and get out of here */ - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; handle_irq_event(desc); - if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT))) + if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); @@ -496,7 +375,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out; @@ -507,8 +386,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * then mask it and get out of here: */ - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { - irq_compat_set_pending(desc); + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; @@ -537,7 +415,7 @@ out: * @desc: the interrupt description structure for this irq * * Interrupt occures on the falling and/or rising edge of a hardware - * signal. The occurence is latched into the irq controller hardware + * signal. The occurrence is latched into the irq controller hardware * and must be acked in order to be reenabled. After the ack another * interrupt can happen on the same source even before the first one * is handled by the associated event handler. If this happens it @@ -558,10 +436,9 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * we shouldn't process the IRQ. Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || - !desc->action))) { + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { if (!irq_check_poll(desc)) { - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; mask_ack_irq(desc); goto out_unlock; @@ -584,20 +461,65 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * Renable it, if it was not disabled in meantime. */ if (unlikely(desc->istate & IRQS_PENDING)) { - if (!(desc->istate & IRQS_DISABLED) && - (desc->istate & IRQS_MASKED)) + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) unmask_irq(desc); } handle_irq_event(desc); } while ((desc->istate & IRQS_PENDING) && - !(desc->istate & IRQS_DISABLED)); + !irqd_irq_disabled(&desc->irq_data)); out_unlock: raw_spin_unlock(&desc->lock); } +#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER +/** + * handle_edge_eoi_irq - edge eoi type IRQ handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Similar as the above handle_edge_irq, but using eoi and w/o the + * mask/unmask logic. + */ +void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + + raw_spin_lock(&desc->lock); + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + /* + * If we're currently running this IRQ, or its disabled, + * we shouldn't process the IRQ. Mark it pending, handle + * the necessary masking and go out + */ + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { + if (!irq_check_poll(desc)) { + desc->istate |= IRQS_PENDING; + goto out_eoi; + } + } + kstat_incr_irqs_this_cpu(irq, desc); + + do { + if (unlikely(!desc->action)) + goto out_eoi; + + handle_irq_event(desc); + + } while ((desc->istate & IRQS_PENDING) && + !irqd_irq_disabled(&desc->irq_data)); + +out_eoi: + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); +} +#endif + /** * handle_percpu_irq - Per CPU local irq handler * @irq: the interrupt number @@ -642,8 +564,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); - irq_compat_set_disabled(desc); - desc->istate |= IRQS_DISABLED; + irq_state_set_disabled(desc); desc->depth = 1; } desc->handle_irq = handle; @@ -684,8 +605,70 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irqd_set(&desc->irq_data, IRQD_PER_CPU); if (irq_settings_can_move_pcntxt(desc)) irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); + if (irq_settings_is_level(desc)) + irqd_set(&desc->irq_data, IRQD_LEVEL); irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); irq_put_desc_unlock(desc, flags); } + +/** + * irq_cpu_online - Invoke all irq_cpu_online functions. + * + * Iterate through all irqs and invoke the chip.irq_cpu_online() + * for each. + */ +void irq_cpu_online(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + chip = irq_data_get_irq_chip(&desc->irq_data); + if (chip && chip->irq_cpu_online && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !irqd_irq_disabled(&desc->irq_data))) + chip->irq_cpu_online(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} + +/** + * irq_cpu_offline - Invoke all irq_cpu_offline functions. + * + * Iterate through all irqs and invoke the chip.irq_cpu_offline() + * for each. + */ +void irq_cpu_offline(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + chip = irq_data_get_irq_chip(&desc->irq_data); + if (chip && chip->irq_cpu_offline && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !irqd_irq_disabled(&desc->irq_data))) + chip->irq_cpu_offline(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h deleted file mode 100644 index 6bbaf66..0000000 --- a/kernel/irq/compat.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Compat layer for transition period - */ -#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -static inline void irq_compat_set_progress(struct irq_desc *desc) -{ - desc->status |= IRQ_INPROGRESS; -} - -static inline void irq_compat_clr_progress(struct irq_desc *desc) -{ - desc->status &= ~IRQ_INPROGRESS; -} -static inline void irq_compat_set_disabled(struct irq_desc *desc) -{ - desc->status |= IRQ_DISABLED; -} -static inline void irq_compat_clr_disabled(struct irq_desc *desc) -{ - desc->status &= ~IRQ_DISABLED; -} -static inline void irq_compat_set_pending(struct irq_desc *desc) -{ - desc->status |= IRQ_PENDING; -} - -static inline void irq_compat_clr_pending(struct irq_desc *desc) -{ - desc->status &= ~IRQ_PENDING; -} -static inline void irq_compat_set_masked(struct irq_desc *desc) -{ - desc->status |= IRQ_MASKED; -} - -static inline void irq_compat_clr_masked(struct irq_desc *desc) -{ - desc->status &= ~IRQ_MASKED; -} -static inline void irq_compat_set_move_pending(struct irq_desc *desc) -{ - desc->status |= IRQ_MOVE_PENDING; -} - -static inline void irq_compat_clr_move_pending(struct irq_desc *desc) -{ - desc->status &= ~IRQ_MOVE_PENDING; -} -static inline void irq_compat_set_affinity(struct irq_desc *desc) -{ - desc->status |= IRQ_AFFINITY_SET; -} - -static inline void irq_compat_clr_affinity(struct irq_desc *desc) -{ - desc->status &= ~IRQ_AFFINITY_SET; -} -#else -static inline void irq_compat_set_progress(struct irq_desc *desc) { } -static inline void irq_compat_clr_progress(struct irq_desc *desc) { } -static inline void irq_compat_set_disabled(struct irq_desc *desc) { } -static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } -static inline void irq_compat_set_pending(struct irq_desc *desc) { } -static inline void irq_compat_clr_pending(struct irq_desc *desc) { } -static inline void irq_compat_set_masked(struct irq_desc *desc) { } -static inline void irq_compat_clr_masked(struct irq_desc *desc) { } -static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } -static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } -static inline void irq_compat_set_affinity(struct irq_desc *desc) { } -static inline void irq_compat_clr_affinity(struct irq_desc *desc) { } -#endif - diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index d1a33b7..306cba3 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -4,8 +4,10 @@ #include <linux/kallsyms.h> -#define P(f) if (desc->status & f) printk("%14s set\n", #f) +#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) #define PS(f) if (desc->istate & f) printk("%14s set\n", #f) +/* FIXME */ +#define PD(f) do { } while (0) static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { @@ -28,13 +30,15 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_NOAUTOEN); PS(IRQS_AUTODETECT); - PS(IRQS_INPROGRESS); PS(IRQS_REPLAY); PS(IRQS_WAITING); - PS(IRQS_DISABLED); PS(IRQS_PENDING); - PS(IRQS_MASKED); + + PD(IRQS_INPROGRESS); + PD(IRQS_DISABLED); + PD(IRQS_MASKED); } #undef P #undef PS +#undef PD diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index 20dc547..b5fcd96 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -31,13 +31,6 @@ static unsigned int noop_ret(struct irq_data *data) return 0; } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -static void compat_noop(unsigned int irq) { } -#define END_INIT .end = compat_noop -#else -#define END_INIT -#endif - /* * Generic no controller implementation */ @@ -48,7 +41,6 @@ struct irq_chip no_irq_chip = { .irq_enable = noop, .irq_disable = noop, .irq_ack = ack_bad, - END_INIT }; /* @@ -64,5 +56,4 @@ struct irq_chip dummy_irq_chip = { .irq_ack = noop, .irq_mask = noop, .irq_unmask = noop, - END_INIT }; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 517561f..90cb55f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -175,28 +175,13 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) struct irqaction *action = desc->action; irqreturn_t ret; - irq_compat_clr_pending(desc); desc->istate &= ~IRQS_PENDING; - irq_compat_set_progress(desc); - desc->istate |= IRQS_INPROGRESS; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); ret = handle_irq_event_percpu(desc, action); raw_spin_lock(&desc->lock); - desc->istate &= ~IRQS_INPROGRESS; - irq_compat_clr_progress(desc); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); return ret; } - -/** - * handle_IRQ_event - irq action chain handler - * @irq: the interrupt number - * @action: the interrupt action chain for this irq - * - * Handles the action chain of an irq event - */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) -{ - return handle_irq_event_percpu(irq_to_desc(irq), action); -} diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 6c6ec9a..6546431 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,10 +15,6 @@ #define istate core_internal_state__do_not_mess_with_it -#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -# define status status_use_accessors -#endif - extern int noirqdebug; /* @@ -44,38 +40,28 @@ enum { * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt * detection * IRQS_POLL_INPROGRESS - polling in progress - * IRQS_INPROGRESS - Interrupt in progress * IRQS_ONESHOT - irq is not unmasked in primary handler * IRQS_REPLAY - irq is replayed * IRQS_WAITING - irq is waiting - * IRQS_DISABLED - irq is disabled * IRQS_PENDING - irq is pending and replayed later - * IRQS_MASKED - irq is masked * IRQS_SUSPENDED - irq is suspended */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, IRQS_POLL_INPROGRESS = 0x00000008, - IRQS_INPROGRESS = 0x00000010, IRQS_ONESHOT = 0x00000020, IRQS_REPLAY = 0x00000040, IRQS_WAITING = 0x00000080, - IRQS_DISABLED = 0x00000100, IRQS_PENDING = 0x00000200, - IRQS_MASKED = 0x00000400, IRQS_SUSPENDED = 0x00000800, }; -#include "compat.h" #include "debug.h" #include "settings.h" #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) -/* Set default functions for irq_chip structures: */ -extern void irq_chip_set_defaults(struct irq_chip *chip); - extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); @@ -162,13 +148,11 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) static inline void irqd_set_move_pending(struct irq_data *d) { d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; - irq_compat_set_move_pending(irq_data_to_desc(d)); } static inline void irqd_clr_move_pending(struct irq_data *d) { d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; - irq_compat_clr_move_pending(irq_data_to_desc(d)); } static inline void irqd_clear(struct irq_data *d, unsigned int mask) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index dbccc79..2c039c9 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -80,7 +80,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); - desc->istate = IRQS_DISABLED; + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; @@ -198,15 +198,6 @@ err: return -ENOMEM; } -struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) -{ - int res = irq_alloc_descs(irq, irq, 1, node); - - if (res == -EEXIST || res == irq) - return irq_to_desc(irq); - return NULL; -} - static int irq_expand_nr_irqs(unsigned int nr) { if (nr > IRQ_BITMAP_BITS) @@ -247,7 +238,6 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .istate = IRQS_DISABLED, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), @@ -283,11 +273,6 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) -{ - return irq_to_desc(irq); -} - static void free_desc(unsigned int irq) { dynamic_irq_cleanup(irq); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a2aa73..07c1611 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -41,7 +41,7 @@ early_param("threadirqs", setup_forced_irqthreads); void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned int state; + bool inprogress; if (!desc) return; @@ -53,16 +53,16 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->istate & IRQS_INPROGRESS) + while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); - state = desc->istate; + inprogress = irqd_irq_inprogress(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ - } while (state & IRQS_INPROGRESS); + } while (inprogress); /* * We made sure that no hardirq handler is running. Now verify @@ -112,13 +112,13 @@ void irq_set_thread_affinity(struct irq_desc *desc) } #ifdef CONFIG_GENERIC_PENDING_IRQ -static inline bool irq_can_move_pcntxt(struct irq_desc *desc) +static inline bool irq_can_move_pcntxt(struct irq_data *data) { - return irq_settings_can_move_pcntxt(desc); + return irqd_can_move_in_process_context(data); } -static inline bool irq_move_pending(struct irq_desc *desc) +static inline bool irq_move_pending(struct irq_data *data) { - return irqd_is_setaffinity_pending(&desc->irq_data); + return irqd_is_setaffinity_pending(data); } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) @@ -131,43 +131,34 @@ irq_get_pending(struct cpumask *mask, struct irq_desc *desc) cpumask_copy(mask, desc->pending_mask); } #else -static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; } -static inline bool irq_move_pending(struct irq_desc *desc) { return false; } +static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } +static inline bool irq_move_pending(struct irq_data *data) { return false; } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } #endif -/** - * irq_set_affinity - Set the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @cpumask: cpumask - * - */ -int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_chip *chip = desc->irq_data.chip; - unsigned long flags; + struct irq_chip *chip = irq_data_get_irq_chip(data); + struct irq_desc *desc = irq_data_to_desc(data); int ret = 0; - if (!chip->irq_set_affinity) + if (!chip || !chip->irq_set_affinity) return -EINVAL; - raw_spin_lock_irqsave(&desc->lock, flags); - - if (irq_can_move_pcntxt(desc)) { - ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + if (irq_can_move_pcntxt(data)) { + ret = chip->irq_set_affinity(data, mask, false); switch (ret) { case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, mask); + cpumask_copy(data->affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); ret = 0; } } else { - irqd_set_move_pending(&desc->irq_data); + irqd_set_move_pending(data); irq_copy_pending(desc, mask); } @@ -175,8 +166,28 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) kref_get(&desc->affinity_notify->kref); schedule_work(&desc->affinity_notify->work); } - irq_compat_set_affinity(desc); - irqd_set(&desc->irq_data, IRQD_AFFINITY_SET); + irqd_set(data, IRQD_AFFINITY_SET); + + return ret; +} + +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @mask: cpumask + * + */ +int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + int ret; + + if (!desc) + return -EINVAL; + + raw_spin_lock_irqsave(&desc->lock, flags); + ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } @@ -206,7 +217,7 @@ static void irq_affinity_notify(struct work_struct *work) goto out; raw_spin_lock_irqsave(&desc->lock, flags); - if (irq_move_pending(desc)) + if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else cpumask_copy(cpumask, desc->irq_data.affinity); @@ -285,10 +296,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) if (cpumask_intersects(desc->irq_data.affinity, cpu_online_mask)) set = desc->irq_data.affinity; - else { - irq_compat_clr_affinity(desc); + else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); - } } cpumask_and(mask, cpu_online_mask, set); @@ -551,9 +560,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, flags &= IRQ_TYPE_SENSE_MASK; if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { - if (!(desc->istate & IRQS_MASKED)) + if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); - if (!(desc->istate & IRQS_DISABLED)) + if (!irqd_irq_disabled(&desc->irq_data)) unmask = 1; } @@ -575,8 +584,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, irqd_set(&desc->irq_data, IRQD_LEVEL); } - if (chip != desc->irq_data.chip) - irq_chip_set_defaults(desc->irq_data.chip); ret = 0; break; default: @@ -651,7 +658,7 @@ again: * irq_wake_thread(). See the comment there which explains the * serialization. */ - if (unlikely(desc->istate & IRQS_INPROGRESS)) { + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); @@ -668,12 +675,10 @@ again: desc->threads_oneshot &= ~action->thread_mask; - if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) && - (desc->istate & IRQS_MASKED)) { - irq_compat_clr_masked(desc); - desc->istate &= ~IRQS_MASKED; - desc->irq_data.chip->irq_unmask(&desc->irq_data); - } + if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) + unmask_irq(desc); + out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); @@ -767,7 +772,7 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); raw_spin_lock_irq(&desc->lock); - if (unlikely(desc->istate & IRQS_DISABLED)) { + if (unlikely(irqd_irq_disabled(&desc->irq_data))) { /* * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which @@ -775,7 +780,6 @@ static int irq_thread(void *data) * but AFAICT IRQS_PENDING should be fine as it * retriggers the interrupt itself --- tglx */ - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { @@ -971,8 +975,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->thread_mask = 1 << ffz(thread_mask); if (!shared) { - irq_chip_set_defaults(desc->irq_data.chip); - init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ @@ -985,8 +987,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ - IRQS_INPROGRESS | IRQS_ONESHOT | \ - IRQS_WAITING); + IRQS_ONESHOT | IRQS_WAITING); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); if (new->flags & IRQF_PERCPU) { irqd_set(&desc->irq_data, IRQD_PER_CPU); @@ -1049,6 +1051,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); + free_cpumask_var(mask); return 0; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index ec4806d..4742090 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -35,7 +35,7 @@ void irq_move_masked_irq(struct irq_data *idata) * do the disable, re-program, enable sequence. * This is *not* particularly important for level triggered * but in a edge trigger case, we might be setting rte - * when an active trigger is comming in. This could + * when an active trigger is coming in. This could * cause some ioapics to mal-function. * Being paranoid i guess! * @@ -53,20 +53,14 @@ void irq_move_masked_irq(struct irq_data *idata) cpumask_clear(desc->pending_mask); } -void move_masked_irq(int irq) -{ - irq_move_masked_irq(irq_get_irq_data(irq)); -} - void irq_move_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_data_to_desc(idata); bool masked; if (likely(!irqd_is_setaffinity_pending(idata))) return; - if (unlikely(desc->istate & IRQS_DISABLED)) + if (unlikely(irqd_irq_disabled(idata))) return; /* @@ -74,15 +68,10 @@ void irq_move_irq(struct irq_data *idata) * threaded interrupt with ONESHOT set, we can end up with an * interrupt storm. */ - masked = desc->istate & IRQS_MASKED; + masked = irqd_irq_masked(idata); if (!masked) idata->chip->irq_mask(idata); irq_move_masked_irq(idata); if (!masked) idata->chip->irq_unmask(idata); } - -void move_native_irq(int irq) -{ - irq_move_irq(irq_get_irq_data(irq)); -} diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 760248d..dd201bd 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -364,6 +364,10 @@ int __weak arch_show_interrupts(struct seq_file *p, int prec) return 0; } +#ifndef ACTUAL_NR_IRQS +# define ACTUAL_NR_IRQS nr_irqs +#endif + int show_interrupts(struct seq_file *p, void *v) { static int prec; @@ -373,10 +377,10 @@ int show_interrupts(struct seq_file *p, void *v) struct irqaction *action; struct irq_desc *desc; - if (i > nr_irqs) + if (i > ACTUAL_NR_IRQS) return 0; - if (i == nr_irqs) + if (i == ACTUAL_NR_IRQS) return arch_show_interrupts(p, prec); /* print header and calculate the width of the first column */ @@ -404,7 +408,20 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - seq_printf(p, " %8s", desc->irq_data.chip->name); + + if (desc->irq_data.chip) { + if (desc->irq_data.chip->irq_print_chip) + desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); + else if (desc->irq_data.chip->name) + seq_printf(p, " %8s", desc->irq_data.chip->name); + else + seq_printf(p, " %8s", "-"); + } else { + seq_printf(p, " %8s", "None"); + } +#ifdef CONFIG_GENIRC_IRQ_SHOW_LEVEL + seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); +#endif if (desc->name) seq_printf(p, "-%-8s", desc->name); diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index ad683a9..14dd576 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -65,7 +65,6 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if (desc->istate & IRQS_REPLAY) return; if (desc->istate & IRQS_PENDING) { - irq_compat_clr_pending(desc); desc->istate &= ~IRQS_PENDING; desc->istate |= IRQS_REPLAY; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 0227ad3..0d91730 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -15,17 +15,8 @@ enum { _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; -#define IRQ_INPROGRESS GOT_YOU_MORON -#define IRQ_REPLAY GOT_YOU_MORON -#define IRQ_WAITING GOT_YOU_MORON -#define IRQ_DISABLED GOT_YOU_MORON -#define IRQ_PENDING GOT_YOU_MORON -#define IRQ_MASKED GOT_YOU_MORON -#define IRQ_WAKEUP GOT_YOU_MORON -#define IRQ_MOVE_PENDING GOT_YOU_MORON #define IRQ_PER_CPU GOT_YOU_MORON #define IRQ_NO_BALANCING GOT_YOU_MORON -#define IRQ_AFFINITY_SET GOT_YOU_MORON #define IRQ_LEVEL GOT_YOU_MORON #define IRQ_NOPROBE GOT_YOU_MORON #define IRQ_NOREQUEST GOT_YOU_MORON @@ -37,102 +28,98 @@ enum { static inline void irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) { - desc->status &= ~(clr & _IRQF_MODIFY_MASK); - desc->status |= (set & _IRQF_MODIFY_MASK); + desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK); + desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); } static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) { - return desc->status & _IRQ_PER_CPU; + return desc->status_use_accessors & _IRQ_PER_CPU; } static inline void irq_settings_set_per_cpu(struct irq_desc *desc) { - desc->status |= _IRQ_PER_CPU; + desc->status_use_accessors |= _IRQ_PER_CPU; } static inline void irq_settings_set_no_balancing(struct irq_desc *desc) { - desc->status |= _IRQ_NO_BALANCING; + desc->status_use_accessors |= _IRQ_NO_BALANCING; } static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) { - return desc->status & _IRQ_NO_BALANCING; + return desc->status_use_accessors & _IRQ_NO_BALANCING; } static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) { - return desc->status & IRQ_TYPE_SENSE_MASK; + return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK; } static inline void irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) { - desc->status &= ~IRQ_TYPE_SENSE_MASK; - desc->status |= mask & IRQ_TYPE_SENSE_MASK; + desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK; + desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK; } static inline bool irq_settings_is_level(struct irq_desc *desc) { - return desc->status & _IRQ_LEVEL; + return desc->status_use_accessors & _IRQ_LEVEL; } static inline void irq_settings_clr_level(struct irq_desc *desc) { - desc->status &= ~_IRQ_LEVEL; + desc->status_use_accessors &= ~_IRQ_LEVEL; } static inline void irq_settings_set_level(struct irq_desc *desc) { - desc->status |= _IRQ_LEVEL; + desc->status_use_accessors |= _IRQ_LEVEL; } static inline bool irq_settings_can_request(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOREQUEST); + return !(desc->status_use_accessors & _IRQ_NOREQUEST); } static inline void irq_settings_clr_norequest(struct irq_desc *desc) { - desc->status &= ~_IRQ_NOREQUEST; + desc->status_use_accessors &= ~_IRQ_NOREQUEST; } static inline void irq_settings_set_norequest(struct irq_desc *desc) { - desc->status |= _IRQ_NOREQUEST; + desc->status_use_accessors |= _IRQ_NOREQUEST; } static inline bool irq_settings_can_probe(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOPROBE); + return !(desc->status_use_accessors & _IRQ_NOPROBE); } static inline void irq_settings_clr_noprobe(struct irq_desc *desc) { - desc->status &= ~_IRQ_NOPROBE; + desc->status_use_accessors &= ~_IRQ_NOPROBE; } static inline void irq_settings_set_noprobe(struct irq_desc *desc) { - desc->status |= _IRQ_NOPROBE; + desc->status_use_accessors |= _IRQ_NOPROBE; } static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) { - return desc->status & _IRQ_MOVE_PCNTXT; + return desc->status_use_accessors & _IRQ_MOVE_PCNTXT; } static inline bool irq_settings_can_autoenable(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOAUTOEN); + return !(desc->status_use_accessors & _IRQ_NOAUTOEN); } static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) { - return desc->status & _IRQ_NESTED_THREAD; + return desc->status_use_accessors & _IRQ_NESTED_THREAD; } - -/* Nothing should touch desc->status from now on */ -#undef status -#define status USE_THE_PROPER_WRAPPERS_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dd586eb..dfbd550 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -45,12 +45,12 @@ bool irq_wait_for_poll(struct irq_desc *desc) #ifdef CONFIG_SMP do { raw_spin_unlock(&desc->lock); - while (desc->istate & IRQS_INPROGRESS) + while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); raw_spin_lock(&desc->lock); - } while (desc->istate & IRQS_INPROGRESS); + } while (irqd_irq_inprogress(&desc->irq_data)); /* Might have been disabled in meantime */ - return !(desc->istate & IRQS_DISABLED) && desc->action; + return !irqd_irq_disabled(&desc->irq_data) && desc->action; #else return false; #endif @@ -75,7 +75,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Do not poll disabled interrupts unless the spurious * disabled poller asks explicitely. */ - if ((desc->istate & IRQS_DISABLED) && !force) + if (irqd_irq_disabled(&desc->irq_data) && !force) goto out; /* @@ -88,12 +88,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) goto out; /* Already running on another processor */ - if (desc->istate & IRQS_INPROGRESS) { + if (irqd_irq_inprogress(&desc->irq_data)) { /* * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too */ - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; goto out; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091..079f1d3 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr) if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || arch_is_kernel_text(addr)) return 1; - return in_gate_area_no_task(addr); + return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) return 1; - return in_gate_area_no_task(addr); + return in_gate_area_no_mm(addr); } static int is_ksym_addr(unsigned long addr) @@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, } /* Look up a kernel symbol and return it in a text buffer. */ -int sprint_symbol(char *buffer, unsigned long address) +static int __sprint_symbol(char *buffer, unsigned long address, + int symbol_offset) { char *modname; const char *name; unsigned long offset, size; int len; + address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) return sprintf(buffer, "0x%lx", address); @@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address) strcpy(buffer, name); len = strlen(buffer); buffer += len; + offset -= symbol_offset; if (modname) - len += sprintf(buffer, "+%#lx/%#lx [%s]", - offset, size, modname); + len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); else len += sprintf(buffer, "+%#lx/%#lx", offset, size); return len; } + +/** + * sprint_symbol - Look up a kernel symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function looks up a kernel symbol with @address and stores its name, + * offset, size and module name to @buffer if possible. If no symbol was found, + * just saves its @address as is. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_symbol(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, 0); +} + EXPORT_SYMBOL_GPL(sprint_symbol); +/** + * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function is for stack backtrace and does the same thing as + * sprint_symbol() but with modified/decreased @address. If there is a + * tail-call to the function marked "noreturn", gcc optimized out code after + * the call so that the stack-saved return address could point outside of the + * caller. This function ensures that kallsyms will find the original caller + * by decreasing @address. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_backtrace(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, -1); +} + /* Look up a kernel symbol and print it to the kernel messages. */ void __print_symbol(const char *fmt, unsigned long address) { @@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p) */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); - seq_printf(m, "%0*lx %c %s\t[%s]\n", - (int)(2 * sizeof(void *)), - iter->value, type, iter->name, iter->module_name); + seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value, + type, iter->name, iter->module_name); } else - seq_printf(m, "%0*lx %c %s\n", - (int)(2 * sizeof(void *)), - iter->value, iter->type, iter->name); + seq_printf(m, "%pK %c %s\n", (void *)iter->value, + iter->type, iter->name); return 0; } diff --git a/kernel/kexec.c b/kernel/kexec.c index ec19b92..87b77de 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -33,6 +33,7 @@ #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/kmsg_dump.h> +#include <linux/syscore_ops.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, /* Initialize the list of destination pages */ INIT_LIST_HEAD(&image->dest_pages); - /* Initialize the list of unuseable pages */ + /* Initialize the list of unusable pages */ INIT_LIST_HEAD(&image->unuseable_pages); /* Read in the segments */ @@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, /* Deal with the destination pages I have inadvertently allocated. * * Ideally I would convert multi-page allocations into single - * page allocations, and add everyting to image->dest_pages. + * page allocations, and add everything to image->dest_pages. * * For now it is simpler to just free the pages. */ @@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image) /* Walk through and free any extra destination pages I may have */ kimage_free_page_list(&image->dest_pages); - /* Walk through and free any unuseable pages I have cached */ + /* Walk through and free any unusable pages I have cached */ kimage_free_page_list(&image->unuseable_pages); } @@ -1099,7 +1100,8 @@ size_t crash_get_memory_size(void) return size; } -static void free_reserved_phys_range(unsigned long begin, unsigned long end) +void __weak crash_free_reserved_phys_range(unsigned long begin, + unsigned long end) { unsigned long addr; @@ -1135,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size) start = roundup(start, PAGE_SIZE); end = roundup(start + new_size, PAGE_SIZE); - free_reserved_phys_range(end, crashk_res.end); + crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); @@ -1531,6 +1533,11 @@ int kernel_kexec(void) local_irq_disable(); /* Suspend system devices */ error = sysdev_suspend(PMSG_FREEZE); + if (!error) { + error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (error) goto Enable_irqs; } else @@ -1545,6 +1552,7 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { + syscore_resume(); sysdev_resume(); Enable_irqs: local_irq_enable(); diff --git a/kernel/kthread.c b/kernel/kthread.c index c55afba..3b34d27 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -27,6 +27,7 @@ struct kthread_create_info /* Information passed to kthread() from kthreadd. */ int (*threadfn)(void *data); void *data; + int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; @@ -98,10 +99,23 @@ static int kthread(void *_create) do_exit(ret); } +/* called from do_fork() to get node information for about to be created task */ +int tsk_fork_get_node(struct task_struct *tsk) +{ +#ifdef CONFIG_NUMA + if (tsk == kthreadd_task) + return tsk->pref_node_fork; +#endif + return numa_node_id(); +} + static void create_kthread(struct kthread_create_info *create) { int pid; +#ifdef CONFIG_NUMA + current->pref_node_fork = create->node; +#endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { @@ -111,33 +125,38 @@ static void create_kthread(struct kthread_create_info *create) } /** - * kthread_create - create a kthread. + * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. + * @node: memory node number. * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). * + * If thread is going to be bound on a particular cpu, give its node + * in @node, to get NUMA affinity for kthread stack, or else give -1. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a - * standalone thread for which noone will call kthread_stop(), or + * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM). */ -struct task_struct *kthread_create(int (*threadfn)(void *data), - void *data, - const char namefmt[], - ...) +struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), + void *data, + int node, + const char namefmt[], + ...) { struct kthread_create_info create; create.threadfn = threadfn; create.data = data; + create.node = node; init_completion(&create.done); spin_lock(&kthread_create_lock); @@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } return create.result; } -EXPORT_SYMBOL(kthread_create); +EXPORT_SYMBOL(kthread_create_on_node); /** * kthread_bind - bind a just-created kthread to a cpu. diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ee74b35..376066e 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk, } /** - * __account_scheduler_latency - record an occured latency + * __account_scheduler_latency - record an occurred latency * @tsk - the task struct of the task hitting the latency * @usecs - the duration of the latency in microseconds * @inter - 1 if the sleep was interruptible, 0 if uninterruptible diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0d2058d..53a6895 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2309,7 +2309,7 @@ void trace_hardirqs_on_caller(unsigned long ip) if (unlikely(curr->hardirqs_enabled)) { /* * Neither irq nor preemption are disabled here - * so this is racy by nature but loosing one hit + * so this is racy by nature but losing one hit * in a stat is not a big deal. */ __debug_atomic_inc(redundant_hardirqs_on); @@ -2620,7 +2620,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (!graph_lock()) return 0; /* - * Make sure we didnt race: + * Make sure we didn't race: */ if (unlikely(hlock_class(this)->usage_mask & new_mask)) { graph_unlock(); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 1969d2f..71edd2f 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, - sum_forward_deps = 0, factor = 0; + sum_forward_deps = 0; list_for_each_entry(class, &all_lock_classes, lock_entry) { @@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_hardirq_unsafe * nr_hardirq_safe + nr_list_entries); - /* - * Estimated factor between direct and indirect - * dependencies: - */ - if (nr_list_entries) - factor = sum_forward_deps / nr_list_entries; - #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", nr_lock_chains, MAX_LOCKDEP_CHAINS); diff --git a/kernel/module.c b/kernel/module.c index efa290e..d5938a5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -809,7 +809,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, wait_for_zero_refcount(mod); mutex_unlock(&module_mutex); - /* Final destruction now noone is using it. */ + /* Final destruction now no one is using it. */ if (mod->exit != NULL) mod->exit(); blocking_notifier_call_chain(&module_notify_list, @@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr, { struct module_sect_attr *sattr = container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%lx\n", sattr->address); + return sprintf(buf, "0x%pK\n", (void *)sattr->address); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -2777,7 +2777,7 @@ static struct module *load_module(void __user *umod, mod->state = MODULE_STATE_COMING; /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. Noone should access us, since + * info during argument parsing. No one should access us, since * strong_try_module_get() will fail. * lockdep/oops can run asynchronous, so use the RCU list insertion * function to insert in a way safe to concurrent readers. @@ -2971,7 +2971,7 @@ static const char *get_ksymbol(struct module *mod, else nextval = (unsigned long)mod->module_core+mod->core_text_size; - /* Scan for closest preceeding symbol, and next symbol. (ELF + /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ for (i = 1; i < mod->num_symtab; i++) { if (mod->symtab[i].st_shndx == SHN_UNDEF) @@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading": "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%p", mod->module_core); + seq_printf(m, " 0x%pK", mod->module_core); /* Taints info */ if (mod->taints) diff --git a/kernel/mutex.c b/kernel/mutex.c index a5889fb..c4195fa 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -245,7 +245,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, state); - /* didnt get the lock, go to sleep: */ + /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); preempt_enable_no_resched(); schedule(); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f74e6c0..a05d191 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -69,13 +69,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ns; } - new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); + new_nsp->uts_ns = copy_utsname(flags, tsk); if (IS_ERR(new_nsp->uts_ns)) { err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } - new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); + new_nsp->ipc_ns = copy_ipcs(flags, tsk); if (IS_ERR(new_nsp->ipc_ns)) { err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; diff --git a/kernel/padata.c b/kernel/padata.c index 7510194..b91941d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd) /* * This cpu has to do the parallel processing of the next * object. It's waiting in the cpu's parallelization queue, - * so exit imediately. + * so exit immediately. */ if (PTR_ERR(padata) == -ENODATA) { del_timer(&pd->timer); @@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd) /* * The next object that needs serialization might have arrived to * the reorder queues in the meantime, we will be called again - * from the timer function if noone else cares for it. + * from the timer function if no one else cares for it. */ if (atomic_read(&pd->reorder_objects) && !(pinst->flags & PADATA_RESET)) @@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst) put_online_cpus(); } -/* Replace the internal control stucture with a new one. */ +/* Replace the internal control structure with a new one. */ static void padata_replace(struct padata_instance *pinst, struct parallel_data *pd_new) { @@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) } /** - * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) + * padata_remove_cpu - remove a cpu from the one or both(serial and parallel) * padata cpumasks. * * @pinst: padata instance diff --git a/kernel/panic.c b/kernel/panic.c index 991bb87..6923167 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); diff --git a/kernel/params.c b/kernel/params.c index 0da1411..7ab388a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -95,7 +95,7 @@ static int parse_one(char *param, /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { - /* Noone handled NULL, so do it here. */ + /* No one handled NULL, so do it here. */ if (!val && params[i].ops->set != param_set_bool) return -EINVAL; DEBUGP("They are equal! Calling %p\n", diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3472bb1..8e81a98 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -145,7 +145,8 @@ static struct srcu_struct pmus_srcu; */ int sysctl_perf_event_paranoid __read_mostly = 1; -int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ +/* Minimum for 512 kiB + 1 user control page */ +int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ /* * max perf event sample rate @@ -363,6 +364,7 @@ void perf_cgroup_switch(struct task_struct *task, int mode) } if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); /* set cgrp before ctxsw in to * allow event_filter_match() to not * have to pass task around @@ -941,6 +943,7 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_cpu_context *cpuctx; /* * We can have double detach due to exit/hot-unplug + close. */ @@ -949,8 +952,17 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) + if (is_cgroup_event(event)) { ctx->nr_cgroups--; + cpuctx = __get_cpu_context(ctx); + /* + * if there are no more cgroup events + * then cler cgrp to avoid stale pointer + * in update_cgrp_time_from_cpuctx() + */ + if (!ctx->nr_cgroups) + cpuctx->cgrp = NULL; + } ctx->nr_events--; if (event->attr.inherit_stat) @@ -2412,6 +2424,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) if (!ctx || !ctx->nr_events) goto out; + /* + * We must ctxsw out cgroup events to avoid conflict + * when invoking perf_task_event_sched_in() later on + * in this function. Otherwise we end up trying to + * ctxswin cgroup events which are already scheduled + * in. + */ + perf_cgroup_sched_out(current); task_ctx_sched_out(ctx, EVENT_ALL); raw_spin_lock(&ctx->lock); @@ -2436,6 +2456,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); + /* + * Also calls ctxswin for cgroup events, if any: + */ perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); @@ -6520,6 +6543,11 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } + if (task) { + put_task_struct(task); + task = NULL; + } + /* * Look up the group leader (we will attach this event to it): */ diff --git a/kernel/pid.c b/kernel/pid.c index 02f2212..57a8346 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) return -1; } -int next_pidmap(struct pid_namespace *pid_ns, int last) +int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) { int offset; struct pidmap *map, *end; + if (last >= PID_MAX_LIMIT) + return -1; + offset = (last + 1) & BITS_PER_PAGE_MASK; map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; end = &pid_ns->pidmap[PIDMAP_ENTRIES]; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a5aff94..e9c9adc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -14,6 +14,7 @@ #include <linux/err.h> #include <linux/acct.h> #include <linux/slab.h> +#include <linux/proc_fs.h> #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; - int i; + int i, err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) @@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); + err = pid_ns_prepare_proc(ns); + if (err) + goto out_put_parent_pid_ns; + return ns; +out_put_parent_pid_ns: + put_pid_ns(parent_pid_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: kmem_cache_free(pid_ns_cachep, ns); out: - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); } static void destroy_pid_namespace(struct pid_namespace *ns) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 67fea9d..0791b13 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1347,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) /* * Now that all the timers on our list have the firing flag, - * noone will touch their list entries but us. We'll take + * no one will touch their list entries but us. We'll take * each timer's lock before clearing its firing flag, so no * timer call will interfere. */ diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4c01249..e5498d7 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -313,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr) * restarted (i.e. we have flagged this in the sys_private entry of the * info block). * - * To protect aginst the timer going away while the interrupt is queued, + * To protect against the timer going away while the interrupt is queued, * we require that the it_requeue_pending flag be set. */ void do_schedule_next_timer(struct siginfo *info) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 4603f08..6de9a8f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -18,9 +18,13 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. +config HIBERNATE_CALLBACKS + bool + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on SWAP && ARCH_HIBERNATION_POSSIBLE + select HIBERNATE_CALLBACKS select LZO_COMPRESS select LZO_DECOMPRESS ---help--- @@ -85,7 +89,7 @@ config PM_STD_PARTITION config PM_SLEEP def_bool y - depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE + depends on SUSPEND || HIBERNATE_CALLBACKS config PM_SLEEP_SMP def_bool y diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c350e18..c5ebc6a 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,4 +1,5 @@ -ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG + +ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 83bbc7c..d09dd10 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -28,7 +28,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, struct page *page, struct bio **bio_chain) { - const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; + const int bio_rw = rw | REQ_SYNC; struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index aeabd26..50aae66 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -273,8 +273,11 @@ static int create_image(int platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_FREEZE); - if (!error) + if (!error) { error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); @@ -407,8 +410,11 @@ static int resume_target_kernel(bool platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_QUIESCE); - if (!error) + if (!error) { error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (error) goto Enable_irqs; diff --git a/kernel/power/main.c b/kernel/power/main.c index 8eaba5f..de9aef8 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -224,7 +224,7 @@ power_attr(state); * writing to 'state'. It first should read from 'wakeup_count' and store * the read value. Then, after carrying out its own preparations for the system * transition to a sleep state, it should write the stored value to - * 'wakeup_count'. If that fails, at least one wakeup event has occured since + * 'wakeup_count'. If that fails, at least one wakeup event has occurred since * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it * is allowed to write to 'state', but the transition will be aborted if there * are any wakeup events detected after 'wakeup_count' was written to. diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 2814c32..8935369 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -164,8 +164,11 @@ static int suspend_enter(suspend_state_t state) BUG_ON(!irqs_disabled()); error = sysdev_suspend(PMSG_SUSPEND); - if (!error) + if (!error) { error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (!error) { if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); diff --git a/kernel/printk.c b/kernel/printk.c index 33284ad..da8ca81 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) /* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ +#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL /* We show everything that is MORE important than this.. */ #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ @@ -113,6 +113,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ /* + * If exclusive_console is non-NULL then only this console is to be printed to. + */ +static struct console *exclusive_console; + +/* * Array of consoles built from command line options (console=) */ struct console_cmdline @@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end) struct console *con; for_each_console(con) { + if (exclusive_console && con != exclusive_console) + continue; if ((con->flags & CON_ENABLED) && con->write && (cpu_online(smp_processor_id()) || (con->flags & CON_ANYTIME))) @@ -1230,6 +1237,11 @@ void console_unlock(void) local_irq_restore(flags); } console_locked = 0; + + /* Release the exclusive_console once it is used */ + if (unlikely(exclusive_console)) + exclusive_console = NULL; + up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) @@ -1316,6 +1328,18 @@ void console_start(struct console *console) } EXPORT_SYMBOL(console_start); +static int __read_mostly keep_bootcon; + +static int __init keep_bootcon_setup(char *str) +{ + keep_bootcon = 1; + printk(KERN_INFO "debug: skip boot console de-registration.\n"); + + return 0; +} + +early_param("keep_bootcon", keep_bootcon_setup); + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -1452,6 +1476,12 @@ void register_console(struct console *newcon) spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; spin_unlock_irqrestore(&logbuf_lock, flags); + /* + * We're about to replay the log buffer. Only do this to the + * just-registered console to avoid excessive message spam to + * the already-registered consoles. + */ + exclusive_console = newcon; } console_unlock(); console_sysfs_notify(); @@ -1463,7 +1493,9 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + if (bcon && + ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && + !keep_bootcon) { /* we need to iterate through twice, to make sure we print * everything out, before we unregister the console(s) */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e2302e4..0fc1eed 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -134,21 +134,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) return 0; rcu_read_lock(); tcred = __task_cred(task); - if ((cred->uid != tcred->euid || - cred->uid != tcred->suid || - cred->uid != tcred->uid || - cred->gid != tcred->egid || - cred->gid != tcred->sgid || - cred->gid != tcred->gid) && - !capable(CAP_SYS_PTRACE)) { - rcu_read_unlock(); - return -EPERM; - } + if (cred->user->user_ns == tcred->user->user_ns && + (cred->uid == tcred->euid && + cred->uid == tcred->suid && + cred->uid == tcred->uid && + cred->gid == tcred->egid && + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + goto ok; + if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) + goto ok; + rcu_read_unlock(); + return -EPERM; +ok: rcu_read_unlock(); smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !capable(CAP_SYS_PTRACE)) + if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) return -EPERM; return security_ptrace_access_check(task, mode); @@ -198,7 +201,7 @@ static int ptrace_attach(struct task_struct *task) goto unlock_tasklist; task->ptrace = PT_PTRACED; - if (capable(CAP_SYS_PTRACE)) + if (task_ns_capable(task, CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); diff --git a/kernel/res_counter.c b/kernel/res_counter.c index c7eaa37..34683ef 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member, pos, buf, s - buf); } +#if BITS_PER_LONG == 32 +u64 res_counter_read_u64(struct res_counter *counter, int member) +{ + unsigned long flags; + u64 ret; + + spin_lock_irqsave(&counter->lock, flags); + ret = *res_counter_member(counter, member); + spin_unlock_irqrestore(&counter->lock, flags); + + return ret; +} +#else u64 res_counter_read_u64(struct res_counter *counter, int member) { return *res_counter_member(counter, member); } +#endif int res_counter_memparse_write_strategy(const char *buf, unsigned long long *res) diff --git a/kernel/sched.c b/kernel/sched.c index a172494..312f8b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2309,7 +2309,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * Cause a process which is running on another CPU to enter * kernel-mode, without any delay. (to get signals handled.) * - * NOTE: this function doesnt have to take the runqueue lock, + * NOTE: this function doesn't have to take the runqueue lock, * because all it wants to ensure is that the remote task enters * the kernel. If the IPI races and the task has been migrated * to another CPU then no harm is done and the purpose has been @@ -4111,6 +4111,16 @@ need_resched: try_to_wake_up_local(to_wakeup); } deactivate_task(rq, prev, DEQUEUE_SLEEP); + + /* + * If we are going to sleep and we have plugged IO queued, make + * sure to submit it to avoid deadlocks. + */ + if (blk_needs_flush_plug(prev)) { + raw_spin_unlock(&rq->lock); + blk_schedule_flush_plug(prev); + raw_spin_lock(&rq->lock); + } } switch_count = &prev->nvcsw; } @@ -4892,8 +4902,11 @@ static bool check_same_owner(struct task_struct *p) rcu_read_lock(); pcred = __task_cred(p); - match = (cred->euid == pcred->euid || - cred->euid == pcred->uid); + if (cred->user->user_ns == pcred->user->user_ns) + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + else + match = false; rcu_read_unlock(); return match; } @@ -4984,7 +4997,7 @@ recheck: */ raw_spin_lock_irqsave(&p->pi_lock, flags); /* - * To be able to change p->policy safely, the apropriate + * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ rq = __task_rq_lock(p); @@ -4998,6 +5011,17 @@ recheck: return -EINVAL; } + /* + * If not changing anything there's no need to proceed further: + */ + if (unlikely(policy == p->policy && (!rt_policy(policy) || + param->sched_priority == p->rt_priority))) { + + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return 0; + } + #ifdef CONFIG_RT_GROUP_SCHED if (user) { /* @@ -5221,7 +5245,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) + if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p); @@ -5460,6 +5484,8 @@ EXPORT_SYMBOL(yield); * yield_to - yield the current processor to another thread in * your thread group, or accelerate that thread toward the * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not * * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. @@ -5525,6 +5551,7 @@ void __sched io_schedule(void) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; schedule(); current->in_iowait = 0; @@ -5540,6 +5567,7 @@ long __sched io_schedule_timeout(long timeout) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; ret = schedule_timeout(timeout); current->in_iowait = 0; @@ -5688,7 +5716,7 @@ void show_state_filter(unsigned long state_filter) do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow - * console might take alot of time: + * console might take a lot of time: */ touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) @@ -6303,6 +6331,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif } + + update_max_interval(); + return NOTIFY_OK; } @@ -8434,7 +8465,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; struct sched_entity *se; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8447,8 +8477,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 5946ac5..429242f 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -179,7 +179,7 @@ void sched_autogroup_create_attach(struct task_struct *p) struct autogroup *ag = autogroup_create(); autogroup_move_group(p, ag); - /* drop extra refrence added by autogroup_create() */ + /* drop extra reference added by autogroup_create() */ autogroup_kref_put(ag); } EXPORT_SYMBOL(sched_autogroup_create_attach); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3f7ec9e..6fa833a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -22,6 +22,7 @@ #include <linux/latencytop.h> #include <linux/sched.h> +#include <linux/cpumask.h> /* * Targeted preemption latency for CPU-bound tasks: @@ -2103,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, struct cfs_rq *busiest_cfs_rq) { - int loops = 0, pulled = 0, pinned = 0; + int loops = 0, pulled = 0; long rem_load_move = max_load_move; struct task_struct *p, *n; if (max_load_move == 0) goto out; - pinned = 1; - list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) break; if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) + !can_migrate_task(p, busiest, this_cpu, sd, idle, + all_pinned)) continue; pull_task(busiest, p, this_rq, this_cpu); @@ -2152,9 +2152,6 @@ out: */ schedstat_add(sd, lb_gained[idle], pulled); - if (all_pinned) - *all_pinned = pinned; - return max_load_move - rem_load_move; } @@ -3061,7 +3058,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, /* * if *imbalance is less than the average load per runnable task - * there is no gaurantee that any tasks will be moved so we'll have + * there is no guarantee that any tasks will be moved so we'll have * a think about bumping its value to force at least one task to be * moved */ @@ -3126,6 +3123,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; + /* * If the busiest group is imbalanced the below checks don't * work because they assumes all things are equal, which typically @@ -3150,7 +3149,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Don't pull any tasks if this group is already above the domain * average load. */ - sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; if (sds.this_load >= sds.avg_load) goto out_balanced; @@ -3339,6 +3337,7 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ + all_pinned = 1; local_irq_save(flags); double_rq_lock(this_rq, busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, @@ -3819,6 +3818,17 @@ void select_nohz_load_balancer(int stop_tick) static DEFINE_SPINLOCK(balancing); +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + +/* + * Scale the max load_balance interval with the number of CPUs in the system. + * This trades load-balance latency on larger machines for less cross talk. + */ +static void update_max_interval(void) +{ + max_load_balance_interval = HZ*num_online_cpus()/10; +} + /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. @@ -3848,10 +3858,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); - if (unlikely(!interval)) - interval = 1; - if (interval > HZ*NR_CPUS/10) - interval = HZ*NR_CPUS/10; + interval = clamp(interval, 1UL, max_load_balance_interval); need_serialize = sd->flags & SD_SERIALIZE; diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index c82f26c1..a776a63 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -94,6 +94,4 @@ static const struct sched_class idle_sched_class = { .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, - - /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index db308cb..e7cebdc 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1378,7 +1378,7 @@ retry: task = pick_next_pushable_task(rq); if (task_cpu(next_task) == rq->cpu && task == next_task) { /* - * If we get here, the task hasnt moved at all, but + * If we get here, the task hasn't moved at all, but * it has failed to push. We will not try again, * since the other cpus will pull from us when they * are ready. @@ -1488,7 +1488,7 @@ static int pull_rt_task(struct rq *this_rq) /* * We continue with the search, just in * case there's an even higher prio task - * in another runqueue. (low likelyhood + * in another runqueue. (low likelihood * but possible) */ } diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 84ec9bc..1ba2bd4 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -102,6 +102,4 @@ static const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, - - /* no .task_new for stop tasks */ }; diff --git a/kernel/signal.c b/kernel/signal.c index 4e3cff1..7165af5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -226,7 +226,7 @@ static inline void print_dropped_signal(int sig) /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an - * appopriate lock must be held to stop the target task from exiting + * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue * __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) @@ -375,15 +375,15 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tracehook_consider_fatal_signal(tsk, sig); } - -/* Notify the system that a driver wants to block all signals for this +/* + * Notify the system that a driver wants to block all signals for this * process, and wants to be notified if any signals at all were to be * sent/acted upon. If the notifier routine returns non-zero, then the * signal will be acted upon after all. If the notifier routine returns 0, * then then signal will be blocked. Only one block per process is * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. */ - + * can use to determine if the signal should be blocked or not. + */ void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) { @@ -434,9 +434,10 @@ still_pending: copy_siginfo(info, &first->info); __sigqueue_free(first); } else { - /* Ok, it wasn't in the queue. This must be - a fast-pathed signal or we must have been - out of queue space. So zero out the info. + /* + * Ok, it wasn't in the queue. This must be + * a fast-pathed signal or we must have been + * out of queue space. So zero out the info. */ info->si_signo = sig; info->si_errno = 0; @@ -468,7 +469,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, } /* - * Dequeue a signal and return the element to the caller, which is + * Dequeue a signal and return the element to the caller, which is * expected to free it. * * All callers have to hold the siglock. @@ -490,7 +491,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * itimers are process shared and we restart periodic * itimers in the signal delivery path to prevent DoS * attacks in the high resolution timer case. This is - * compliant with the old way of self restarting + * compliant with the old way of self-restarting * itimers, as the SIGALRM is a legacy signal and only * queued once. Changing the restart behaviour to * restart the timer in the signal dequeue path is @@ -636,13 +637,33 @@ static inline bool si_fromuser(const struct siginfo *info) } /* + * called with RCU read lock from check_kill_permission() + */ +static int kill_ok_by_cred(struct task_struct *t) +{ + const struct cred *cred = current_cred(); + const struct cred *tcred = __task_cred(t); + + if (cred->user->user_ns == tcred->user->user_ns && + (cred->euid == tcred->suid || + cred->euid == tcred->uid || + cred->uid == tcred->suid || + cred->uid == tcred->uid)) + return 1; + + if (ns_capable(tcred->user->user_ns, CAP_KILL)) + return 1; + + return 0; +} + +/* * Bad permissions for sending the signal * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { - const struct cred *cred, *tcred; struct pid *sid; int error; @@ -656,14 +677,8 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; - cred = current_cred(); - tcred = __task_cred(t); if (!same_thread_group(current, t) && - (cred->euid ^ tcred->suid) && - (cred->euid ^ tcred->uid) && - (cred->uid ^ tcred->suid) && - (cred->uid ^ tcred->uid) && - !capable(CAP_KILL)) { + !kill_ok_by_cred(t)) { switch (sig) { case SIGCONT: sid = task_session(t); @@ -909,14 +924,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, if (info == SEND_SIG_FORCED) goto out_set; - /* Real-time signals must be queued if sent by sigqueue, or - some other real-time mechanism. It is implementation - defined whether kill() does so. We attempt to do so, on - the principle of least surprise, but since kill is not - allowed to fail with EAGAIN when low on memory we just - make sure at least one signal gets delivered and don't - pass on the info struct. */ - + /* + * Real-time signals must be queued if sent by sigqueue, or + * some other real-time mechanism. It is implementation + * defined whether kill() does so. We attempt to do so, on + * the principle of least surprise, but since kill is not + * allowed to fail with EAGAIN when low on memory we just + * make sure at least one signal gets delivered and don't + * pass on the info struct. + */ if (sig < SIGRTMIN) override_rlimit = (is_si_special(info) || info->si_code >= 0); else @@ -1187,8 +1203,7 @@ retry: return error; } -int -kill_proc_info(int sig, struct siginfo *info, pid_t pid) +int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; rcu_read_lock(); @@ -1285,8 +1300,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ -int -send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { /* * Make sure legacy kernel users don't send in bad values @@ -1354,7 +1368,7 @@ EXPORT_SYMBOL(kill_pid); * These functions support sending signals using preallocated sigqueue * structures. This is needed "because realtime applications cannot * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of Posix Timers + * expirations or I/O completions". In the case of POSIX Timers * we allocate the sigqueue structure from the timer_create. If this * allocation fails we are able to report the failure to the application * with an EAGAIN error. @@ -1539,7 +1553,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) info.si_signo = SIGCHLD; info.si_errno = 0; /* - * see comment in do_notify_parent() abot the following 3 lines + * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); @@ -1597,7 +1611,7 @@ static inline int may_ptrace_stop(void) } /* - * Return nonzero if there is a SIGKILL that should be waking us up. + * Return non-zero if there is a SIGKILL that should be waking us up. * Called with the siglock held. */ static int sigkill_pending(struct task_struct *tsk) @@ -1721,7 +1735,7 @@ void ptrace_notify(int exit_code) /* * This performs the stopping for SIGSTOP and other stop signals. * We have to stop all threads in the thread group. - * Returns nonzero if we've actually stopped and released the siglock. + * Returns non-zero if we've actually stopped and released the siglock. * Returns zero if we didn't stop and still hold the siglock. */ static int do_signal_stop(int signr) @@ -1809,10 +1823,12 @@ static int ptrace_signal(int signr, siginfo_t *info, current->exit_code = 0; - /* Update the siginfo structure if the signal has - changed. If the debugger wanted something - specific in the siginfo structure then it should - have updated *info via PTRACE_SETSIGINFO. */ + /* + * Update the siginfo structure if the signal has + * changed. If the debugger wanted something + * specific in the siginfo structure then it should + * have updated *info via PTRACE_SETSIGINFO. + */ if (signr != info->si_signo) { info->si_signo = signr; info->si_errno = 0; @@ -1871,7 +1887,7 @@ relock: for (;;) { struct k_sigaction *ka; /* - * Tracing can induce an artifical signal and choose sigaction. + * Tracing can induce an artificial signal and choose sigaction. * The return value in @signr determines the default action, * but @info->si_signo is the signal number we will report. */ @@ -2020,7 +2036,8 @@ void exit_signals(struct task_struct *tsk) if (!signal_pending(tsk)) goto out; - /* It could be that __group_complete_signal() choose us to + /* + * It could be that __group_complete_signal() choose us to * notify about group-wide signal. Another thread should be * woken now to take the signal since we will not. */ @@ -2058,6 +2075,9 @@ EXPORT_SYMBOL(unblock_all_signals); * System call entry points. */ +/** + * sys_restart_syscall - restart a system call + */ SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = ¤t_thread_info()->restart_block; @@ -2111,6 +2131,13 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) return error; } +/** + * sys_rt_sigprocmask - change the list of currently blocked signals + * @how: whether to add, remove, or set signals + * @set: stores pending signals + * @oset: previous value of signal mask if non-null + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, sigset_t __user *, oset, size_t, sigsetsize) { @@ -2169,8 +2196,14 @@ long do_sigpending(void __user *set, unsigned long sigsetsize) out: return error; -} +} +/** + * sys_rt_sigpending - examine a pending signal that has been raised + * while blocked + * @set: stores pending signals + * @sigsetsize: size of sigset_t type or larger + */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) { return do_sigpending(set, sigsetsize); @@ -2219,9 +2252,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_trapno, &to->si_trapno); #endif #ifdef BUS_MCEERR_AO - /* + /* * Other callers might not initialize the si_lsb field, - * so check explicitely for the right codes here. + * so check explicitly for the right codes here. */ if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); @@ -2250,6 +2283,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #endif +/** + * sys_rt_sigtimedwait - synchronously wait for queued signals specified + * in @uthese + * @uthese: queued signals to wait for + * @uinfo: if non-null, the signal's siginfo is returned here + * @uts: upper bound on process time suspension + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct timespec __user *, uts, size_t, sigsetsize) @@ -2266,7 +2307,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; - + /* * Invert the set of allowed signals to get those we * want to block. @@ -2291,9 +2332,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, + (ts.tv_sec || ts.tv_nsec)); if (timeout) { - /* None ready -- temporarily unblock those we're + /* + * None ready -- temporarily unblock those we're * interested while we are sleeping in so that we'll - * be awakened when they arrive. */ + * be awakened when they arrive. + */ current->real_blocked = current->blocked; sigandsets(¤t->blocked, ¤t->blocked, &these); recalc_sigpending(); @@ -2325,6 +2368,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return ret; } +/** + * sys_kill - send a signal to a process + * @pid: the PID of the process + * @sig: signal to be sent + */ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; @@ -2400,7 +2448,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) return do_tkill(tgid, pid, sig); } -/* +/** + * sys_tkill - send signal to one specific task + * @pid: the PID of the task + * @sig: signal to be sent + * * Send a signal to only one task, even if it's a CLONE_THREAD task. */ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) @@ -2412,6 +2464,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) return do_tkill(0, pid, sig); } +/** + * sys_rt_sigqueueinfo - send signal information to a signal + * @pid: the PID of the thread + * @sig: signal to be sent + * @uinfo: signal info to be sent + */ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { @@ -2421,9 +2479,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, return -EFAULT; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info.si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info.si_code >= 0 || info.si_code == SI_TKILL) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info.si_code < 0); return -EPERM; + } info.si_signo = sig; /* POSIX.1b doesn't mention process groups. */ @@ -2437,9 +2499,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) return -EINVAL; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info->si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info->si_code >= 0 || info->si_code == SI_TKILL) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info->si_code < 0); return -EPERM; + } info->si_signo = sig; return do_send_specific(tgid, pid, sig, info); @@ -2531,12 +2597,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s error = -EINVAL; /* - * - * Note - this code used to test ss_flags incorrectly + * Note - this code used to test ss_flags incorrectly: * old code may have been written using ss_flags==0 * to mean ss_flags==SS_ONSTACK (as this was the only * way that worked) - this fix preserves that older - * mechanism + * mechanism. */ if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) goto out; @@ -2570,6 +2635,10 @@ out: #ifdef __ARCH_WANT_SYS_SIGPENDING +/** + * sys_sigpending - examine pending signals + * @set: where mask of pending signal is returned + */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) { return do_sigpending(set, sizeof(*set)); @@ -2578,8 +2647,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK -/* Some platforms have their own version with special arguments others - support only sys_rt_sigprocmask. */ +/** + * sys_sigprocmask - examine and change blocked signals + * @how: whether to add, remove, or set signals + * @set: signals to add or remove (if non-null) + * @oset: previous value of signal mask if non-null + * + * Some platforms have their own version with special arguments; + * others support only sys_rt_sigprocmask. + */ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, old_sigset_t __user *, oset) @@ -2632,6 +2708,13 @@ out: #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifdef __ARCH_WANT_SYS_RT_SIGACTION +/** + * sys_rt_sigaction - alter an action taken by a process + * @sig: signal to be sent + * @act: new sigaction + * @oact: used to save the previous sigaction + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct sigaction __user *, act, struct sigaction __user *, oact, @@ -2718,6 +2801,12 @@ SYSCALL_DEFINE0(pause) #endif #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND +/** + * sys_rt_sigsuspend - replace the signal mask for a value with the + * @unewset value until a signal is received + * @unewset: new signal mask value + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; diff --git a/kernel/smp.c b/kernel/smp.c index 7cbd0f2..73a1951 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -604,6 +604,87 @@ void ipi_call_unlock_irq(void) } #endif /* USE_GENERIC_SMP_HELPERS */ +/* Setup configured maximum number of CPUs to activate */ +unsigned int setup_max_cpus = NR_CPUS; +EXPORT_SYMBOL(setup_max_cpus); + + +/* + * Setup routine for controlling SMP activation + * + * Command-line option of "nosmp" or "maxcpus=0" will disable SMP + * activation entirely (the MPS table probe still happens, though). + * + * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer + * greater than 0, limits the maximum number of CPUs activated in + * SMP mode to <NUM>. + */ + +void __weak arch_disable_smp_support(void) { } + +static int __init nosmp(char *str) +{ + setup_max_cpus = 0; + arch_disable_smp_support(); + + return 0; +} + +early_param("nosmp", nosmp); + +/* this is hard limit */ +static int __init nrcpus(char *str) +{ + int nr_cpus; + + get_option(&str, &nr_cpus); + if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + nr_cpu_ids = nr_cpus; + + return 0; +} + +early_param("nr_cpus", nrcpus); + +static int __init maxcpus(char *str) +{ + get_option(&str, &setup_max_cpus); + if (setup_max_cpus == 0) + arch_disable_smp_support(); + + return 0; +} + +early_param("maxcpus", maxcpus); + +/* Setup number of possible processor ids */ +int nr_cpu_ids __read_mostly = NR_CPUS; +EXPORT_SYMBOL(nr_cpu_ids); + +/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ +void __init setup_nr_cpu_ids(void) +{ + nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; +} + +/* Called by boot processor to activate the rest. */ +void __init smp_init(void) +{ + unsigned int cpu; + + /* FIXME: This should be done in userspace --RR */ + for_each_present_cpu(cpu) { + if (num_online_cpus() >= setup_max_cpus) + break; + if (!cpu_online(cpu)) + cpu_up(cpu); + } + + /* Any cleanup work */ + printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); + smp_cpus_done(setup_max_cpus); +} + /* * Call a function on all processors. May be used during early boot while * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead diff --git a/kernel/softirq.c b/kernel/softirq.c index 56e5dec..174f976 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -567,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data) /** * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks * @ttimer: tasklet_hrtimer which is initialized - * @function: hrtimer callback funtion which gets called from softirq context + * @function: hrtimer callback function which gets called from softirq context * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) */ @@ -845,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); + p = kthread_create_on_node(run_ksoftirqd, + hcpu, + cpu_to_node(hotcpu), + "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); return notifier_from_errno(PTR_ERR(p)); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2df820b..e3516b2 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: BUG_ON(stopper->thread || stopper->enabled || !list_empty(&stopper->works)); - p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", - cpu); + p = kthread_create_on_node(cpu_stopper_thread, + stopper, + cpu_to_node(cpu), + "migration/%d", cpu); if (IS_ERR(p)) return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); diff --git a/kernel/sys.c b/kernel/sys.c index 1ad48b3..af468ed 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -120,16 +120,33 @@ EXPORT_SYMBOL(cad_pid); void (*pm_power_off_prepare)(void); /* + * Returns true if current's euid is same as p's uid or euid, + * or has CAP_SYS_NICE to p's user_ns. + * + * Called with rcu_read_lock, creds are safe + */ +static bool set_one_prio_perm(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred = __task_cred(p); + + if (pcred->user->user_ns == cred->user->user_ns && + (pcred->uid == cred->euid || + pcred->euid == cred->euid)) + return true; + if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) + return true; + return false; +} + +/* * set the priority of a task * - the caller must hold the RCU read lock */ static int set_one_prio(struct task_struct *p, int niceval, int error) { - const struct cred *cred = current_cred(), *pcred = __task_cred(p); int no_nice; - if (pcred->uid != cred->euid && - pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { + if (!set_one_prio_perm(p)) { error = -EPERM; goto out; } @@ -506,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (rgid != (gid_t) -1) { if (old->gid == rgid || old->egid == rgid || - capable(CAP_SETGID)) + nsown_capable(CAP_SETGID)) new->gid = rgid; else goto error; @@ -515,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (old->gid == egid || old->egid == egid || old->sgid == egid || - capable(CAP_SETGID)) + nsown_capable(CAP_SETGID)) new->egid = egid; else goto error; @@ -550,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) old = current_cred(); retval = -EPERM; - if (capable(CAP_SETGID)) + if (nsown_capable(CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = gid; else if (gid == old->gid || gid == old->sgid) new->egid = new->fsgid = gid; @@ -617,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) new->uid = ruid; if (old->uid != ruid && old->euid != ruid && - !capable(CAP_SETUID)) + !nsown_capable(CAP_SETUID)) goto error; } @@ -626,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) if (old->uid != euid && old->euid != euid && old->suid != euid && - !capable(CAP_SETUID)) + !nsown_capable(CAP_SETUID)) goto error; } @@ -674,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) old = current_cred(); retval = -EPERM; - if (capable(CAP_SETUID)) { + if (nsown_capable(CAP_SETUID)) { new->suid = new->uid = uid; if (uid != old->uid) { retval = set_user(new); @@ -716,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) old = current_cred(); retval = -EPERM; - if (!capable(CAP_SETUID)) { + if (!nsown_capable(CAP_SETUID)) { if (ruid != (uid_t) -1 && ruid != old->uid && ruid != old->euid && ruid != old->suid) goto error; @@ -780,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) old = current_cred(); retval = -EPERM; - if (!capable(CAP_SETGID)) { + if (!nsown_capable(CAP_SETGID)) { if (rgid != (gid_t) -1 && rgid != old->gid && rgid != old->egid && rgid != old->sgid) goto error; @@ -840,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) if (uid == old->uid || uid == old->euid || uid == old->suid || uid == old->fsuid || - capable(CAP_SETUID)) { + nsown_capable(CAP_SETUID)) { if (uid != old_fsuid) { new->fsuid = uid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) @@ -873,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) if (gid == old->gid || gid == old->egid || gid == old->sgid || gid == old->fsgid || - capable(CAP_SETGID)) { + nsown_capable(CAP_SETGID)) { if (gid != old_fsgid) { new->fsgid = gid; goto change_okay; @@ -1181,8 +1198,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; down_write(&uts_sem); @@ -1230,7 +1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; @@ -1345,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, rlim = tsk->signal->rlim + resource; task_lock(tsk->group_leader); if (new_rlim) { + /* Keep the capable check against init_user_ns until + cgroups can contain all limits */ if (new_rlim->rlim_max > rlim->rlim_max && !capable(CAP_SYS_RESOURCE)) retval = -EPERM; @@ -1388,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task) { const struct cred *cred = current_cred(), *tcred; - tcred = __task_cred(task); - if (current != task && - (cred->uid != tcred->euid || - cred->uid != tcred->suid || - cred->uid != tcred->uid || - cred->gid != tcred->egid || - cred->gid != tcred->sgid || - cred->gid != tcred->gid) && - !capable(CAP_SYS_RESOURCE)) { - return -EPERM; - } + if (current == task) + return 0; - return 0; + tcred = __task_cred(task); + if (cred->user->user_ns == tcred->user->user_ns && + (cred->uid == tcred->euid && + cred->uid == tcred->suid && + cred->uid == tcred->uid && + cred->gid == tcred->egid && + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + return 0; + if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) + return 0; + + return -EPERM; } SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 40245d69..c0bb324 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -117,6 +117,7 @@ static int neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static unsigned long one_ul = 1; static int one_hundred = 100; #ifdef CONFIG_PRINTK @@ -169,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; @@ -706,7 +712,7 @@ static struct ctl_table kern_table[] = { .data = &kptr_restrict, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dmesg_restrict, .extra1 = &zero, .extra2 = &two, }, @@ -971,14 +977,18 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "panic_on_oom", .data = &sysctl_panic_on_oom, .maxlen = sizeof(sysctl_panic_on_oom), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "oom_kill_allocating_task", @@ -1006,7 +1016,8 @@ static struct ctl_table vm_table[] = { .data = &page_cluster, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "dirty_background_ratio", @@ -1054,7 +1065,8 @@ static struct ctl_table vm_table[] = { .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "nr_pdflush_threads", @@ -1130,6 +1142,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = drop_caches_sysctl_handler, + .extra1 = &one, + .extra2 = &three, }, #ifdef CONFIG_COMPACTION { @@ -2385,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write, return err; } +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + struct do_proc_dointvec_minmax_conv_param { int *min; int *max; diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 10b90d8..4e4932a 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) const char *fail = NULL; if (table->parent) { - if (table->procname && !table->parent->procname) + if (!table->parent->procname) set_fail(&fail, table, "Parent without procname"); } - if (!table->procname) - set_fail(&fail, table, "No procname"); if (table->child) { if (table->data) set_fail(&fail, table, "Directory with data?"); @@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) set_fail(&fail, table, "No maxlen"); } #ifdef CONFIG_PROC_SYSCTL - if (table->procname && !table->proc_handler) + if (!table->proc_handler) set_fail(&fail, table, "No proc_handler"); #endif -#if 0 - if (!table->procname && table->proc_handler) - set_fail(&fail, table, "proc_handler without procname"); -#endif sysctl_check_leaf(namespaces, table, &fail); } if (table->mode > 0777) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 3971c6b..9ffea36 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -685,7 +685,7 @@ static int __init taskstats_init(void) goto err_cgroup_ops; family_registered = 1; - printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); + pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); return 0; err_cgroup_ops: genl_unregister_ops(&family, &taskstats_ops); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index b2fa506..a470154 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -34,7 +34,7 @@ * inaccuracies caused by missed or lost timer * interrupts and the inability for the timer * interrupt hardware to accuratly tick at the - * requested HZ value. It is also not reccomended + * requested HZ value. It is also not recommended * for "tick-less" systems. */ #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5f1bb8e..f6117a4 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -652,6 +652,8 @@ int do_adjtimex(struct timex *txc) struct timespec delta; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; + if (!capable(CAP_SYS_TIME)) + return -EPERM; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; result = timekeeping_inject_offset(&delta); diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 25028dd..c340ca6 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -19,7 +19,6 @@ */ #include <linux/device.h> #include <linux/file.h> -#include <linux/mutex.h> #include <linux/posix-clock.h> #include <linux/slab.h> #include <linux/syscalls.h> @@ -34,19 +33,19 @@ static struct posix_clock *get_posix_clock(struct file *fp) { struct posix_clock *clk = fp->private_data; - mutex_lock(&clk->mutex); + down_read(&clk->rwsem); if (!clk->zombie) return clk; - mutex_unlock(&clk->mutex); + up_read(&clk->rwsem); return NULL; } static void put_posix_clock(struct posix_clock *clk) { - mutex_unlock(&clk->mutex); + up_read(&clk->rwsem); } static ssize_t posix_clock_read(struct file *fp, char __user *buf, @@ -156,7 +155,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) struct posix_clock *clk = container_of(inode->i_cdev, struct posix_clock, cdev); - mutex_lock(&clk->mutex); + down_read(&clk->rwsem); if (clk->zombie) { err = -ENODEV; @@ -172,7 +171,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) fp->private_data = clk; } out: - mutex_unlock(&clk->mutex); + up_read(&clk->rwsem); return err; } @@ -211,25 +210,20 @@ int posix_clock_register(struct posix_clock *clk, dev_t devid) int err; kref_init(&clk->kref); - mutex_init(&clk->mutex); + init_rwsem(&clk->rwsem); cdev_init(&clk->cdev, &posix_clock_file_operations); clk->cdev.owner = clk->ops.owner; err = cdev_add(&clk->cdev, devid, 1); - if (err) - goto no_cdev; return err; -no_cdev: - mutex_destroy(&clk->mutex); - return err; } EXPORT_SYMBOL_GPL(posix_clock_register); static void delete_clock(struct kref *kref) { struct posix_clock *clk = container_of(kref, struct posix_clock, kref); - mutex_destroy(&clk->mutex); + if (clk->release) clk->release(clk); } @@ -238,9 +232,9 @@ void posix_clock_unregister(struct posix_clock *clk) { cdev_del(&clk->cdev); - mutex_lock(&clk->mutex); + down_write(&clk->rwsem); clk->zombie = true; - mutex_unlock(&clk->mutex); + up_write(&clk->rwsem); kref_put(&clk->kref, delete_clock); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3bd7e3d..8ad5d57 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -14,7 +14,7 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/sched.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/time.h> @@ -597,13 +597,12 @@ static struct timespec timekeeping_suspend_time; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev: unused * * This is for the generic clocksource timekeeping. * xtime/wall_to_monotonic/jiffies/etc are * still managed by arch specific suspend/resume code. */ -static int timekeeping_resume(struct sys_device *dev) +static void timekeeping_resume(void) { unsigned long flags; struct timespec ts; @@ -632,11 +631,9 @@ static int timekeeping_resume(struct sys_device *dev) /* Resume hrtimers */ hres_timers_resume(); - - return 0; } -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +static int timekeeping_suspend(void) { unsigned long flags; @@ -654,26 +651,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) } /* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { - .name = "timekeeping", +static struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_resume, .suspend = timekeeping_suspend, }; -static struct sys_device device_timer = { - .id = 0, - .cls = &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) +static int __init timekeeping_init_ops(void) { - int error = sysdev_class_register(&timekeeping_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; + register_syscore_ops(&timekeeping_syscore_ops); + return 0; } -device_initcall(timekeeping_init_device); +device_initcall(timekeeping_init_ops); /* * If the error is already larger, we look ahead even further diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 2f3b585..a5d0a3a 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, unsigned int timer_flag) { /* - * It doesnt matter which lock we take: + * It doesn't matter which lock we take: */ raw_spinlock_t *lock; struct entry *entry, input; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index cbafed7..6957aa2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q) * **/ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) + u32 what) { struct blk_trace *bt = q->blk_trace; - int rw = rq->cmd_flags & 0x03; if (likely(!bt)) return; - if (rq->cmd_flags & REQ_DISCARD) - rw |= REQ_DISCARD; - - if (rq->cmd_flags & REQ_SECURE) - rw |= REQ_SECURE; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, + __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, - what, rq->errors, 0, NULL); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq->cmd_flags, what, rq->errors, 0, NULL); } } @@ -857,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } -static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) +static void blk_add_trace_unplug(void *ignore, struct request_queue *q, + unsigned int depth, bool explicit) { struct blk_trace *bt = q->blk_trace; if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, - sizeof(rpdu), &rpdu); - } -} + __be64 rpdu = cpu_to_be64(depth); + u32 what; -static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); + if (explicit) + what = BLK_TA_UNPLUG_IO; + else + what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, - sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); } } @@ -1022,9 +1007,7 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); - WARN_ON(ret); - ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); + ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); @@ -1039,8 +1022,7 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); - unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); + unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 888b611..ee24fa1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1268,7 +1268,7 @@ static int ftrace_update_code(struct module *mod) p->flags = 0L; /* - * Do the initial record convertion from mcount jump + * Do the initial record conversion from mcount jump * to the NOP instructions. */ if (!ftrace_code_disable(mod, p)) { @@ -1467,7 +1467,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) return t_hash_next(m, pos); (*pos)++; - iter->pos = *pos; + iter->pos = iter->func_pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) return t_hash_start(m, pos); @@ -1502,7 +1502,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (!rec) return t_hash_start(m, pos); - iter->func_pos = *pos; iter->func = rec; return iter; @@ -3426,7 +3425,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; - /* make curr_ret_stack visable before we add the ret_stack */ + /* make curr_ret_stack visible before we add the ret_stack */ smp_wmb(); t->ret_stack = ret_stack; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d9c8bca..0ef7b4b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1478,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage) return local_read(&bpage->entries) & RB_WRITE_MASK; } -/* Size is determined by what has been commited */ +/* Size is determined by what has been committed */ static inline unsigned rb_page_size(struct buffer_page *bpage) { return rb_page_commit(bpage); @@ -2932,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* * cpu_buffer->pages just needs to point to the buffer, it * has no specific buffer page to point to. Lets move it out - * of our way so we don't accidently swap it. + * of our way so we don't accidentally swap it. */ cpu_buffer->pages = reader->list.prev; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9541c27..d38c16a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3239,7 +3239,7 @@ waitagain: trace_seq_init(&iter->seq); /* - * If there was nothing to send to user, inspite of consuming trace + * If there was nothing to send to user, in spite of consuming trace * entries, go back to wait for more entries. */ if (sret == -EBUSY) diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 685a67d..6302747 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void) } /* - * trace_clock(): 'inbetween' trace clock. Not completely serialized, + * trace_clock(): 'between' trace clock. Not completely serialized, * but not completely incorrect when crossing CPUs either. * * This is based on cpu_clock(), which will allow at most ~1 jiffy of diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 1516cb3..e32744c 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -27,7 +27,7 @@ * in the structure. * * * for structures within structures, the format of the internal - * structure is layed out. This allows the internal structure + * structure is laid out. This allows the internal structure * to be deciphered for the format file. Although these macros * may become out of sync with the internal structure, they * will create a compile error if it happens. Since the diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 76b0598..962cdb2 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, * * returns 1 if * - we are inside irq code - * - we just extered irq code + * - we just entered irq code * * retunns 0 if * - funcgraph-interrupts option is set diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 92b6e1e..a4969b4 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = { * skip the latency if the sequence has changed - some other section * did a maximum and could disturb our measurement with serial console * printouts, etc. Truly coinciding maximum latencies should be rare - * and what happens together happens separately as well, so this doesnt + * and what happens together happens separately as well, so this doesn't * decrease the validity of the maximum found: */ static __cacheline_aligned_in_smp unsigned long max_sequence; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8435b43..35d55a3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1839,7 +1839,7 @@ static void unregister_probe_event(struct trace_probe *tp) kfree(tp->call.print_fmt); } -/* Make a debugfs interface for controling probe points */ +/* Make a debugfs interface for controlling probe points */ static __init int init_kprobe_trace(void) { struct dentry *d_tracer; diff --git a/kernel/uid16.c b/kernel/uid16.c index 4192098..51c6e89 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!capable(CAP_SETGID)) + if (!nsown_capable(CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index eb27fd3..92cb706 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); /* * Removes a registered user return notifier. Must be called from atomic - * context, and from the same cpu registration occured in. + * context, and from the same cpu registration occurred in. */ void user_return_notifier_unregister(struct user_return_notifier *urn) { diff --git a/kernel/user.c b/kernel/user.c index 5c598ca..9e03e9c 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -17,9 +17,13 @@ #include <linux/module.h> #include <linux/user_namespace.h> +/* + * userns count is 1 for root user, 1 for init_uts_ns, + * and 1 for... ? + */ struct user_namespace init_user_ns = { .kref = { - .refcount = ATOMIC_INIT(2), + .refcount = ATOMIC_INIT(3), }, .creator = &root_user, }; @@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep; */ static DEFINE_SPINLOCK(uidhash_lock); -/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ +/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ struct user_struct root_user = { .__count = ATOMIC_INIT(2), .processes = ATOMIC_INIT(1), diff --git a/kernel/utsname.c b/kernel/utsname.c index 8a82b4b..4464617 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -14,6 +14,7 @@ #include <linux/utsname.h> #include <linux/err.h> #include <linux/slab.h> +#include <linux/user_namespace.h> static struct uts_namespace *create_uts_ns(void) { @@ -30,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void) * @old_ns: namespace to clone * Return NULL on error (failure to kmalloc), new ns otherwise */ -static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) +static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, + struct uts_namespace *old_ns) { struct uts_namespace *ns; @@ -40,6 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); up_read(&uts_sem); return ns; } @@ -50,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) +struct uts_namespace *copy_utsname(unsigned long flags, + struct task_struct *tsk) { + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; BUG_ON(!old_ns); @@ -60,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol if (!(flags & CLONE_NEWUTS)) return old_ns; - new_ns = clone_uts_ns(old_ns); + new_ns = clone_uts_ns(tsk, old_ns); put_uts_ns(old_ns); return new_ns; @@ -71,5 +76,6 @@ void free_uts_ns(struct kref *kref) struct uts_namespace *ns; ns = container_of(kref, struct uts_namespace, kref); + put_user_ns(ns->user_ns); kfree(ns); } diff --git a/kernel/wait.c b/kernel/wait.c index b0310eb..f45ea8d 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait); * woken up through the queue. * * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and noone wakes up + * aborts and is woken up concurrently and no one wakes up * the next waiter. */ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18bb157..140dce7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); * Should we panic when a soft-lockup or hard-lockup occurs: */ #ifdef CONFIG_HARDLOCKUP_DETECTOR -static int hardlockup_panic; +static int hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static int __init hardlockup_panic_setup(char *str) { if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) watchdog_enabled = 0; return 1; @@ -415,19 +418,22 @@ static int watchdog_prepare_cpu(int cpu) static int watchdog_enable(int cpu) { struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - int err; + int err = 0; /* enable the perf event */ err = watchdog_nmi_enable(cpu); - if (err) - return err; + + /* Regardless of err above, fall through and start softlockup */ /* create the watchdog thread */ if (!p) { p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); - return PTR_ERR(p); + if (!err) + /* if hardlockup hasn't already set this */ + err = PTR_ERR(p); + goto out; } kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; @@ -435,7 +441,8 @@ static int watchdog_enable(int cpu) wake_up_process(p); } - return 0; +out: + return err; } static void watchdog_disable(int cpu) @@ -547,7 +554,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif /* CONFIG_HOTPLUG_CPU */ } - return notifier_from_errno(err); + + /* + * hardlockup and softlockup are not important enough + * to block cpu bring up. Just always succeed and + * rely on printk output to flag problems. + */ + return NOTIFY_OK; } static struct notifier_block __cpuinitdata cpu_nfb = { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5ca7ce9..8859a41 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1291,7 +1291,7 @@ __acquires(&gcwq->lock) return true; spin_unlock_irq(&gcwq->lock); - /* CPU has come up inbetween, retry migration */ + /* CPU has come up in between, retry migration */ cpu_relax(); } } @@ -1366,8 +1366,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) worker->id = id; if (!on_unbound_cpu) - worker->task = kthread_create(worker_thread, worker, - "kworker/%u:%d", gcwq->cpu, id); + worker->task = kthread_create_on_node(worker_thread, + worker, + cpu_to_node(gcwq->cpu), + "kworker/%u:%d", gcwq->cpu, id); else worker->task = kthread_create(worker_thread, worker, "kworker/u:%d", id); |