From 7b595756ec1f49e0049a9e01a1298d53a7faaa15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jun 2007 03:45:17 +0900 Subject: sysfs: kill unnecessary attribute->owner sysfs is now completely out of driver/module lifetime game. After deletion, a sysfs node doesn't access anything outside sysfs proper, so there's no reason to hold onto the attribute owners. Note that often the wrong modules were accounted for as owners leading to accessing removed modules. This patch kills now unnecessary attribute->owner. Note that with this change, userland holding a sysfs node does not prevent the backing module from being unloaded. For more info regarding lifetime rule cleanup, please read the following message. http://article.gmane.org/gmane.linux.kernel/510293 (tweaked by Greg to not delete the field just yet, to make it easier to merge things properly.) Signed-off-by: Tejun Heo Cc: Cornelia Huck Cc: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 9 +++------ kernel/params.c | 1 - 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9bd93de..015d60c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -488,8 +488,7 @@ static void free_modinfo_##field(struct module *mod) \ mod->field = NULL; \ } \ static struct module_attribute modinfo_##field = { \ - .attr = { .name = __stringify(field), .mode = 0444, \ - .owner = THIS_MODULE }, \ + .attr = { .name = __stringify(field), .mode = 0444 }, \ .show = show_modinfo_##field, \ .setup = setup_modinfo_##field, \ .test = modinfo_##field##_exists, \ @@ -793,7 +792,7 @@ static ssize_t show_refcnt(struct module_attribute *mattr, } static struct module_attribute refcnt = { - .attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE }, + .attr = { .name = "refcnt", .mode = 0444 }, .show = show_refcnt, }; @@ -851,7 +850,7 @@ static ssize_t show_initstate(struct module_attribute *mattr, } static struct module_attribute initstate = { - .attr = { .name = "initstate", .mode = 0444, .owner = THIS_MODULE }, + .attr = { .name = "initstate", .mode = 0444 }, .show = show_initstate, }; @@ -1032,7 +1031,6 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, sattr->mattr.show = module_sect_show; sattr->mattr.store = NULL; sattr->mattr.attr.name = sattr->name; - sattr->mattr.attr.owner = mod; sattr->mattr.attr.mode = S_IRUGO; *(gattr++) = &(sattr++)->mattr.attr; } @@ -1090,7 +1088,6 @@ int module_add_modinfo_attrs(struct module *mod) if (!attr->test || (attr->test && attr->test(mod))) { memcpy(temp_attr, attr, sizeof(*temp_attr)); - temp_attr->attr.owner = mod; error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); ++temp_attr; } diff --git a/kernel/params.c b/kernel/params.c index e61c46c..effbaae 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -491,7 +491,6 @@ param_sysfs_setup(struct module_kobject *mk, pattr->mattr.show = param_attr_show; pattr->mattr.store = param_attr_store; pattr->mattr.attr.name = (char *)&kp->name[name_skip]; - pattr->mattr.attr.owner = mk->mod; pattr->mattr.attr.mode = kp->perm; *(gattr++) = &(pattr++)->mattr.attr; } -- cgit v1.1 From ed0321895182ffb6ecf210e066d87911b270d587 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 28 Jun 2007 15:55:21 -0400 Subject: security: Protection for exploiting null dereference using mmap Add a new security check on mmap operations to see if the user is attempting to mmap to low area of the address space. 
The amount of space protected is indicated by the new proc tunable /proc/sys/vm/mmap_min_addr and defaults to 0, preserving existing behavior. This patch uses a new SELinux security class "memprotect." Policy already contains a number of allow rules like a_t self:process * (unconfined_t being one of them) which mean that putting this check in the process class (its best current fit) would make it useless as all user processes, which we also want to protect against, would be allowed. By taking the memprotect name of the new class it will also make it possible for us to move some of the other memory protect permissions out of 'process' and into the new class next time we bump the policy version number (which I also think is a good future idea) Acked-by: Stephen Smalley Acked-by: Chris Wright Signed-off-by: Eric Paris Signed-off-by: James Morris --- kernel/sysctl.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 51f5dac..d93e13d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -949,6 +949,16 @@ static ctl_table vm_table[] = { .strategy = &sysctl_jiffies, }, #endif +#ifdef CONFIG_SECURITY + { + .ctl_name = CTL_UNNUMBERED, + .procname = "mmap_min_addr", + .data = &mmap_min_addr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, +#endif #if defined(CONFIG_X86_32) || \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { -- cgit v1.1 From 24da24de2eae0c277b85836e2b4b09cfafeea995 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 12 Jul 2007 08:12:04 +0200 Subject: relay: fix bogus cast in subbuf_splice_actor() The current code that sets the read position in subbuf_splice_actor may give erroneous results if the buffer size isn't a power of 2. This patch fixes the problem. Signed-off-by: Tom Zanussi Signed-off-by: Jens Axboe --- kernel/relay.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 3b299fb..7802697 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1074,7 +1074,9 @@ static int subbuf_splice_actor(struct file *in, unsigned int pidx, poff, total_len, subbuf_pages, ret; struct rchan_buf *rbuf = in->private_data; unsigned int subbuf_size = rbuf->chan->subbuf_size; - size_t read_start = ((size_t)*ppos) % rbuf->chan->alloc_size; + uint64_t pos = (uint64_t) *ppos; + uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size; + size_t read_start = (size_t) do_div(pos, alloc_size); size_t read_subbuf = read_start / subbuf_size; size_t padding = rbuf->padding[read_subbuf]; size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; -- cgit v1.1 From d3f35d98b3b87d2506289320375687c6e9bc53ed Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 12 Jul 2007 08:12:05 +0200 Subject: relay: fixup kerneldoc comment Change comment from kerneldoc to normal. 
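For context, the do_div() idiom used in the splice-actor fix above is the kernel's standard way to take a 64-bit file position modulo a 32-bit size on 32-bit machines: casting *ppos down to size_t and using '%' happens to give the right remainder when alloc_size is a power of two (the low bits survive the truncation) but not otherwise. A minimal sketch of the pattern, with illustrative variable names:

	#include <asm/div64.h>

	uint64_t pos = (uint64_t)*ppos;		/* 64-bit read position */
	uint32_t alloc_size = chan_alloc_size;	/* 32-bit buffer size, not necessarily a power of 2 */
	size_t read_start;

	/* do_div() leaves the quotient in pos and returns the 32-bit remainder */
	read_start = (size_t)do_div(pos, alloc_size);
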
Signed-off-by: Tom Zanussi Signed-off-by: Jens Axboe --- kernel/relay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 7802697..a615a8f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1061,7 +1061,7 @@ static struct pipe_buf_operations relay_pipe_buf_ops = { .get = generic_pipe_buf_get, }; -/** +/* * subbuf_splice_actor - splice up to one subbuf's worth of data */ static int subbuf_splice_actor(struct file *in, -- cgit v1.1 From e127031f4f76dc367c5d2f9d883715730dd82f7d Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 11 Jul 2007 21:21:47 +0200 Subject: [PATCH] sched: fix prio_to_wmult[] for nice 1 There's a typo in the values in prio_to_wmult[] for nice level 1. While it did not cause bad CPU distribution, but caused more rescheduling between nice-0 and nice-1 tasks than necessary. Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9fbced6..2ab7fa8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -751,7 +751,7 @@ static const u32 prio_to_wmult[40] = { 184467, 230589, 288233, 360285, 450347, 562979, 703746, 879575, 1099582, 1374389, 717986, 2147483, 2684354, 3355443, 4194304, - 244160, 6557201, 8196502, 10250518, 12782640, + 5244160, 6557201, 8196502, 10250518, 12782640, 16025997, 19976592, 24970740, 31350126, 39045157, 49367440, 61356675, 76695844, 95443717, 119304647, 148102320, 186737708, 238609294, 286331153, -- cgit v1.1 From a5968df8737eda477d9d1038f5428ebd4d0884e1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 11 Jul 2007 21:21:47 +0200 Subject: [PATCH] sched: allow larger granularity Allow granularity up to 100 msecs, instead of 10 msecs. (needed on larger boxes) Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2ab7fa8..9088c2d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4778,7 +4778,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; static inline void sched_init_granularity(void) { unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long gran_limit = 10000000; + const unsigned long gran_limit = 100000000; sysctl_sched_granularity *= factor; if (sysctl_sched_granularity > gran_limit) -- cgit v1.1 From 45f384a64f0769bb9a3caf0516de88a629f48e61 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 11 Jul 2007 21:21:47 +0200 Subject: [PATCH] sched: remove stale version info from kernel/sched_debug.c kernel/sched_debug.c referred to CFS -v20, but there's no CFS versioning needed within the upstream kernel. 
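(For the granularity patch above, the limit is expressed in nanoseconds: the old gran_limit of 10000000 ns is the 10 msecs mentioned in its changelog, and the new 100000000 ns is 100 msecs.)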
Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 1baf87c..29f2c21 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -171,7 +171,7 @@ static int sched_debug_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.04, cfs-v20, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.05, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); -- cgit v1.1 From 4bd77321a833077c5c9ac7b9d284e261e4a8906e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 11 Jul 2007 21:21:47 +0200 Subject: [PATCH] sched: fix show_task()/show_tasks() output fix show_task()/show_tasks() output: - there's no sibling info anymore - the fields were not aligned properly with the description - get rid of the lazy-TLB output: it's been quite some time since we last had a bug there, and when we had a bug it wasnt helped a bit by this debug output. Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9088c2d..0559665 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4647,14 +4647,14 @@ static void show_task(struct task_struct *p) state = p->state ? __ffs(p->state) + 1 : 0; printk("%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); -#if (BITS_PER_LONG == 32) +#if BITS_PER_LONG == 32 if (state == TASK_RUNNING) - printk(" running "); + printk(" running "); else - printk(" %08lX ", thread_saved_pc(p)); + printk(" %08lx ", thread_saved_pc(p)); #else if (state == TASK_RUNNING) - printk(" running task "); + printk(" running task "); else printk(" %016lx ", thread_saved_pc(p)); #endif @@ -4666,11 +4666,7 @@ static void show_task(struct task_struct *p) free = (unsigned long)n - (unsigned long)end_of_stack(p); } #endif - printk("%5lu %5d %6d", free, p->pid, p->parent->pid); - if (!p->mm) - printk(" (L-TLB)\n"); - else - printk(" (NOTLB)\n"); + printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid); if (state != TASK_RUNNING) show_stack(p, NULL); @@ -4680,14 +4676,12 @@ void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; -#if (BITS_PER_LONG == 32) - printk("\n" - " free sibling\n"); - printk(" task PC stack pid father child younger older\n"); +#if BITS_PER_LONG == 32 + printk(KERN_INFO + " task PC stack pid father\n"); #else - printk("\n" - " free sibling\n"); - printk(" task PC stack pid father child younger older\n"); + printk(KERN_INFO + " task PC stack pid father\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { -- cgit v1.1 From 4fd885170bf13841ada921495b7b00c4b9971cf9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Jul 2007 21:43:55 +0200 Subject: CFS: Fix missing digit off in wmult table Roman Zippel noticed another inconsistency of the wmult table. wmult[16] has a missing digit. 
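(A quick arithmetic check, using the property documented a few patches below that these entries are 2^32 divided by the corresponding load weight and that adjacent nice levels differ by a factor of roughly 1.25: the neighbouring nice -5 entry is 1374389, and 1374389 * 1.25 = 1717986.25, so the nice -4 slot must read 1717986 rather than the truncated 717986.)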
Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0559665..3332bbb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -750,7 +750,7 @@ static const u32 prio_to_wmult[40] = { 48356, 60446, 75558, 94446, 118058, 147573, 184467, 230589, 288233, 360285, 450347, 562979, 703746, 879575, 1099582, 1374389, - 717986, 2147483, 2684354, 3355443, 4194304, + 1717986, 2147483, 2684354, 3355443, 4194304, 5244160, 6557201, 8196502, 10250518, 12782640, 16025997, 19976592, 24970740, 31350126, 39045157, 49367440, 61356675, 76695844, 95443717, 119304647, -- cgit v1.1 From f9153ee6c71cb9ab38de3b8ed66b1c3fa27c3f7d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 16 Jul 2007 09:46:30 +0200 Subject: [PATCH] sched: improve weight-array comments improve the comments around the wmult array (which controls the weight of niced tasks). Clarify that to achieve a 10% difference in CPU utilization, a weight multiplier of 1.25 has to be used. Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3332bbb..a7284bc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -736,7 +736,9 @@ static void update_curr_load(struct rq *rq, u64 now) * * The "10% effect" is relative and cumulative: from _any_ nice level, * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) */ static const int prio_to_weight[40] = { /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, -- cgit v1.1 From 5714d2de93fbb156c5e45fb101a2b4f0cae8fbb7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 16 Jul 2007 09:46:31 +0200 Subject: [PATCH] sched: document prio_to_wmult[] document prio_to_wmult[]. Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a7284bc..90d22b7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -748,6 +748,13 @@ static const int prio_to_weight[40] = { /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, }; +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ static const u32 prio_to_wmult[40] = { 48356, 60446, 75558, 94446, 118058, 147573, 184467, 230589, 288233, 360285, 450347, -- cgit v1.1 From e4af30be8fd0bed0e8f96e4e1ebd546a3dfa8f2b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 16 Jul 2007 09:46:31 +0200 Subject: [PATCH] sched: prettify prio_to_wmult[] prettify the prio_to_wmult[] array. 
(this could have saved us from the typos) Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 90d22b7..1c80766 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -756,14 +756,14 @@ static const int prio_to_weight[40] = { * into multiplications: */ static const u32 prio_to_wmult[40] = { - 48356, 60446, 75558, 94446, 118058, 147573, - 184467, 230589, 288233, 360285, 450347, - 562979, 703746, 879575, 1099582, 1374389, - 1717986, 2147483, 2684354, 3355443, 4194304, - 5244160, 6557201, 8196502, 10250518, 12782640, - 16025997, 19976592, 24970740, 31350126, 39045157, - 49367440, 61356675, 76695844, 95443717, 119304647, - 148102320, 186737708, 238609294, 286331153, +/* -20 */ 48356, 60446, 75558, 94446, 118058, +/* -15 */ 147573, 184467, 230589, 288233, 360285, +/* -10 */ 450347, 562979, 703746, 879575, 1099582, +/* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443, +/* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518, +/* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126, +/* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717, +/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; static inline void -- cgit v1.1 From db912f963909b3cbc3a059b7528f6a1a1eb6ffae Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 24 May 2007 12:23:10 +0300 Subject: HOTPLUG: Add CPU_DYING notifier KVM wants a notification when a cpu is about to die, so it can disable hardware extensions, but at a time when user processes cannot be scheduled on the cpu, so it doesn't try to use virtualization extensions after they have been disabled. This adds a CPU_DYING notification. The notification is called in atomic context on the doomed cpu. Signed-off-by: Avi Kivity --- kernel/cpu.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 208cf34..181ae70 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -103,11 +103,19 @@ static inline void check_for_tasks(int cpu) write_unlock_irq(&tasklist_lock); } +struct take_cpu_down_param { + unsigned long mod; + void *hcpu; +}; + /* Take this CPU down. */ -static int take_cpu_down(void *unused) +static int take_cpu_down(void *_param) { + struct take_cpu_down_param *param = _param; int err; + raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, + param->hcpu); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) @@ -127,6 +135,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) cpumask_t old_allowed, tmp; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; + struct take_cpu_down_param tcd_param = { + .mod = mod, + .hcpu = hcpu, + }; if (num_online_cpus() == 1) return -EBUSY; @@ -153,7 +165,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) set_cpus_allowed(current, tmp); mutex_lock(&cpu_bitmask_lock); - p = __stop_machine_run(take_cpu_down, NULL, cpu); + p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); mutex_unlock(&cpu_bitmask_lock); if (IS_ERR(p) || cpu_online(cpu)) { -- cgit v1.1 From ac076758b97d9e3d2c1557cfa412911e93cd0919 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 24 May 2007 12:33:15 +0300 Subject: HOTPLUG: Adapt cpuset hotplug callback to CPU_DYING CPU_DYING is called in atomic context, so don't try to take any locks. 
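A minimal sketch of how a consumer of the new notification might look (hypothetical names, not part of either patch); the callback runs on the dying CPU in atomic context, so it must neither sleep nor take sleeping locks:

	#include <linux/cpu.h>
	#include <linux/notifier.h>

	static int example_cpu_callback(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
	{
		int cpu = (long)hcpu;

		switch (action) {
		case CPU_DYING:
		case CPU_DYING_FROZEN:
			/* Runs on the CPU that is about to go down, where user
			   tasks can no longer be scheduled: tear down per-cpu
			   hardware state here, without sleeping. */
			example_disable_hw_on(cpu);	/* hypothetical helper */
			break;
		}
		return NOTIFY_OK;
	}

The block would be hooked up with register_cpu_notifier() like any other hotplug callback.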
Signed-off-by: Avi Kivity --- kernel/cpuset.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4c49188..c4d123f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2138,6 +2138,9 @@ static void common_cpu_mem_hotplug_unplug(void) static int cpuset_handle_cpuhp(struct notifier_block *nb, unsigned long phase, void *cpu) { + if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) + return NOTIFY_DONE; + common_cpu_mem_hotplug_unplug(); return 0; } -- cgit v1.1 From eaa944afb206f3fc4393630811ee621b866e3255 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 15 Jul 2007 23:37:27 -0700 Subject: console: more buf for index parsing Change name to buf according to the usage as name + index Signed-off-by: Yinghai Lu Cc: Andi Kleen Cc: Bjorn Helgaas Cc: Russell King Cc: Gerd Hoffmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 0bbdeac..4961410 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -654,7 +654,7 @@ static void call_console_drivers(unsigned long start, unsigned long end) */ static int __init console_setup(char *str) { - char name[sizeof(console_cmdline[0].name)]; + char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ char *s, *options; int idx; @@ -662,27 +662,27 @@ static int __init console_setup(char *str) * Decode str into name, index, options. */ if (str[0] >= '0' && str[0] <= '9') { - strcpy(name, "ttyS"); - strncpy(name + 4, str, sizeof(name) - 5); + strcpy(buf, "ttyS"); + strncpy(buf + 4, str, sizeof(buf) - 5); } else { - strncpy(name, str, sizeof(name) - 1); + strncpy(buf, str, sizeof(buf) - 1); } - name[sizeof(name) - 1] = 0; + buf[sizeof(buf) - 1] = 0; if ((options = strchr(str, ',')) != NULL) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) - strcpy(name, "ttyS0"); + strcpy(buf, "ttyS0"); if (!strcmp(str, "ttyb")) - strcpy(name, "ttyS1"); + strcpy(buf, "ttyS1"); #endif - for (s = name; *s; s++) + for (s = buf; *s; s++) if ((*s >= '0' && *s <= '9') || *s == ',') break; idx = simple_strtoul(s, NULL, 10); *s = 0; - add_preferred_console(name, idx, options); + add_preferred_console(buf, idx, options); return 1; } __setup("console=", console_setup); @@ -709,7 +709,7 @@ int __init add_preferred_console(char *name, int idx, char *options) * See if this tty is not yet registered, and * if we have a slot free. */ - for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) if (strcmp(console_cmdline[i].name, name) == 0 && console_cmdline[i].index == idx) { selected_console = i; -- cgit v1.1 From d37bf60de0b4ddc1633cf278189d3c9bf28fe3d2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 15 Jul 2007 23:37:28 -0700 Subject: console: console handover to preferred console for earlyprintk=ttyS0,9600 console=tty0 console=ttyS0,9600n8 the handover will happen from earlyser0 to tty0. but what we want is to hand over to ttyS0. Later with serial-convert-early_uart-to-earlycon-for-8250.patch, console=tty0 console=uart8250,io,0x3f8,9600n8 will handover to ttyS0 instead of tty0. 
Signed-off-by: Yinghai Lu Cc: Andi Kleen Cc: Bjorn Helgaas Cc: Russell King Cc: Gerd Hoffmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 4961410..7ce9a8c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -985,12 +985,15 @@ void register_console(struct console *console) if (!(console->flags & CON_ENABLED)) return; - if (bootconsole) { + if (bootconsole && (console->flags & CON_CONSDEV)) { printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", bootconsole->name, bootconsole->index, console->name, console->index); unregister_console(bootconsole); console->flags &= ~CON_PRINTBUFFER; + } else { + printk(KERN_INFO "console [%s%d] enabled\n", + console->name, console->index); } /* -- cgit v1.1 From 18a8bd949d6adb311ea816125ff65050df1f3f6e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 15 Jul 2007 23:37:59 -0700 Subject: serial: convert early_uart to earlycon for 8250 Beacuse SERIAL_PORT_DFNS is removed from include/asm-i386/serial.h and include/asm-x86_64/serial.h. the serial8250_ports need to be probed late in serial initializing stage. the console_init=>serial8250_console_init=> register_console=>serial8250_console_setup will return -ENDEV, and console ttyS0 can not be enabled at that time. need to wait till uart_add_one_port in drivers/serial/serial_core.c to call register_console to get console ttyS0. that is too late. Make early_uart to use early_param, so uart console can be used earlier. Make it to be bootconsole with CON_BOOT flag, so can use console handover feature. and it will switch to corresponding normal serial console automatically. new command line will be: console=uart8250,io,0x3f8,9600n8 console=uart8250,mmio,0xff5e0000,115200n8 or earlycon=uart8250,io,0x3f8,9600n8 earlycon=uart8250,mmio,0xff5e0000,115200n8 it will print in very early stage: Early serial console at I/O port 0x3f8 (options '9600n8') console [uart0] enabled later for console it will print: console handover: boot [uart0] -> real [ttyS0] Signed-off-by: Cc: Andi Kleen Cc: Bjorn Helgaas Cc: Russell King Cc: Gerd Hoffmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 7ce9a8c..f46cc6d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -726,6 +726,25 @@ int __init add_preferred_console(char *name, int idx, char *options) return 0; } +int __init update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) +{ + struct console_cmdline *c; + int i; + + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + if (strcmp(console_cmdline[i].name, name) == 0 && + console_cmdline[i].index == idx) { + c = &console_cmdline[i]; + memcpy(c->name, name_new, sizeof(c->name)); + c->name[sizeof(c->name) - 1] = 0; + c->options = options; + c->index = idx_new; + return i; + } + /* not found */ + return -1; +} + #ifndef CONFIG_DISABLE_CONSOLE_SUSPEND /** * suspend_console - suspend the console subsystem @@ -942,6 +961,9 @@ void register_console(struct console *console) if (preferred_console < 0 || bootconsole || !console_drivers) preferred_console = selected_console; + if (console->early_setup) + console->early_setup(); + /* * See if we want to use this console driver. 
If we * didn't select a console we take the first one -- cgit v1.1 From f0c0b2b808f232741eadac272bd4bc51f18df0f4 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Sun, 15 Jul 2007 23:38:01 -0700 Subject: change zonelist order: zonelist order selection logic Make zonelist creation policy selectable from sysctl/boot option v6. This patch makes NUMA's zonelist (of pgdat) order selectable. Available order are Default(automatic)/ Node-based / Zone-based. [Default Order] The kernel selects Node-based or Zone-based order automatically. [Node-based Order] This policy treats the locality of memory as the most important parameter. Zonelist order is created by each zone's locality. This means lower zones (ex. ZONE_DMA) can be used before higher zone (ex. ZONE_NORMAL) exhausion. IOW. ZONE_DMA will be in the middle of zonelist. current 2.6.21 kernel uses this. Pros. * A user can expect local memory as much as possible. Cons. * lower zone will be exhansted before higher zone. This may cause OOM_KILL. Maybe suitable if ZONE_DMA is relatively big and you never see OOM_KILL because of ZONE_DMA exhaution and you need the best locality. (example) assume 2 node NUMA. node(0) has ZONE_DMA/ZONE_NORMAL, node(1) has ZONE_NORMAL. *node(0)'s memory allocation order: node(0)'s NORMAL -> node(0)'s DMA -> node(1)'s NORMAL. *node(1)'s memory allocation order: node(1)'s NORMAL -> node(0)'s NORMAL -> node(0)'s DMA. [Zone-based order] This policy treats the zone type as the most important parameter. Zonelist order is created by zone-type order. This means lower zone never be used bofere higher zone exhaustion. IOW. ZONE_DMA will be always at the tail of zonelist. Pros. * OOM_KILL(bacause of lower zone) occurs only if the whole zones are exhausted. Cons. * memory locality may not be best. (example) assume 2 node NUMA. node(0) has ZONE_DMA/ZONE_NORMAL, node(1) has ZONE_NORMAL. *node(0)'s memory allocation order: node(0)'s NORMAL -> node(1)'s NORMAL -> node(0)'s DMA. *node(1)'s memory allocation order: node(1)'s NORMAL -> node(0)'s NORMAL -> node(0)'s DMA. bootoption "numa_zonelist_order=" and proc/sysctl is supporetd. command: %echo N > /proc/sys/vm/numa_zonelist_order Will rebuild zonelist in Node-based order. command: %echo Z > /proc/sys/vm/numa_zonelist_order Will rebuild zonelist in Zone-based order. Thanks to Lee Schermerhorn, he gives me much help and codes. 
[Lee.Schermerhorn@hp.com: add check_highest_zone to build_zonelists_in_zone_order] [akpm@linux-foundation.org: build fix] Signed-off-by: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Christoph Lameter Cc: Andi Kleen Cc: "jesse.barnes@intel.com" Signed-off-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d93e13d..ccaebbb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -958,6 +958,17 @@ static ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, +#ifdef CONFIG_NUMA + { + .ctl_name = CTL_UNNUMBERED, + .procname = "numa_zonelist_order", + .data = &numa_zonelist_order, + .maxlen = NUMA_ZONELIST_ORDER_LEN, + .mode = 0644, + .proc_handler = &numa_zonelist_order_handler, + .strategy = &sysctl_string, + }, +#endif #endif #if defined(CONFIG_X86_32) || \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) -- cgit v1.1 From 98011f569e2ae1e4ae394f6e23faa16676d50de4 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Sun, 15 Jul 2007 23:38:17 -0700 Subject: mm: fix improper .init-type section references .. which modpost started warning about. Signed-off-by: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index bbd51b8..a404f7e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -215,7 +215,7 @@ int kthread_stop(struct task_struct *k) EXPORT_SYMBOL(kthread_stop); -static __init void kthreadd_setup(void) +static noinline __init_refok void kthreadd_setup(void) { struct task_struct *tsk = current; -- cgit v1.1 From e18eecb8b35703a5eea73ee2b45324262029e62c Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Sun, 15 Jul 2007 23:38:48 -0700 Subject: Add generic exit-time stack-depth checking to CONFIG_DEBUG_STACK_USAGE Add generic exit-time stack-depth checking to CONFIG_DEBUG_STACK_USAGE. This also adds UML support. Tested on UML and i386. 
[akpm@linux-foundation.org: cleanups, speedups, tweaks] Signed-off-by: Jeff Dike Cc: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ca6a11b..64a5263 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -858,6 +858,34 @@ static void exit_notify(struct task_struct *tsk) release_task(tsk); } +#ifdef CONFIG_DEBUG_STACK_USAGE +static void check_stack_usage(void) +{ + static DEFINE_SPINLOCK(low_water_lock); + static int lowest_to_date = THREAD_SIZE; + unsigned long *n = end_of_stack(current); + unsigned long free; + + while (*n == 0) + n++; + free = (unsigned long)n - (unsigned long)end_of_stack(current); + + if (free >= lowest_to_date) + return; + + spin_lock(&low_water_lock); + if (free < lowest_to_date) { + printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " + "left\n", + current->comm, free); + lowest_to_date = free; + } + spin_unlock(&low_water_lock); +} +#else +static inline void check_stack_usage(void) {} +#endif + fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; @@ -949,6 +977,7 @@ fastcall NORET_TYPE void do_exit(long code) exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); + check_stack_usage(); exit_thread(); cpuset_exit(tsk); exit_keys(tsk); -- cgit v1.1 From 6175ecfed3c81d388735c75f7a0ad08dc4de02d3 Mon Sep 17 00:00:00 2001 From: Sripathi Kodi Date: Sun, 15 Jul 2007 23:39:26 -0700 Subject: Use write_trylock_irqsave in ptrace_attach This patch makes ptrace_attach use write_trylock_irqsave(). [akpm@linux-foundation.org: remove unneeded initialisation] Signed-off-by: Sripathi Kodi Cc: Ingo Molnar Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ad7949a..b1d11f1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -161,6 +161,7 @@ int ptrace_may_attach(struct task_struct *task) int ptrace_attach(struct task_struct *task) { int retval; + unsigned long flags; audit_ptrace(task); @@ -181,9 +182,7 @@ repeat: * cpu's that may have task_lock). */ task_lock(task); - local_irq_disable(); - if (!write_trylock(&tasklist_lock)) { - local_irq_enable(); + if (!write_trylock_irqsave(&tasklist_lock, flags)) { task_unlock(task); do { cpu_relax(); @@ -211,7 +210,7 @@ repeat: force_sig_specific(SIGSTOP, task); bad: - write_unlock_irq(&tasklist_lock); + write_unlock_irqrestore(&tasklist_lock, flags); task_unlock(task); out: return retval; -- cgit v1.1 From 7c3f1a573237b90ef331267260358a0ec4ac9079 Mon Sep 17 00:00:00 2001 From: Tomas Janousek Date: Sun, 15 Jul 2007 23:39:41 -0700 Subject: Introduce boot based time The commits 411187fb05cd11676b0979d9fbf3291db69dbce2 (GTOD: persistent clock support) c1d370e167d66b10bca3b602d3740405469383de (i386: use GTOD persistent clock support) changed the monotonic time so that it no longer jumps after resume, but it's not possible to use it for boot time and process start time calculations then. Also, the uptime no longer increases during suspend. I add a variable to track the wall_to_monotonic changes, a function to get the real boot time and a function to get the boot based time from the monotonic one. 
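A short sketch of the intended call pattern (mirroring what the follow-up /proc patches below do for process start times):

	struct timespec boot, ts;

	getboottime(&boot);			/* real (wall-clock) time of system boot */

	do_posix_clock_monotonic_gettime(&ts);	/* monotonic time; does not advance across suspend */
	monotonic_to_bootbased(&ts);		/* add total sleep time -> boot-based timestamp */
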
[akpm@linux-foundation.org: remove exports, add comment] Signed-off-by: Tomas Janousek Cc: Tomas Smetana Cc: John Stultz Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/timekeeping.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3d1042f..728cedf 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -36,9 +36,17 @@ EXPORT_SYMBOL(xtime_lock); * at zero at system boot time, so wall_to_monotonic will be negative, * however, we will ALWAYS keep the tv_nsec part positive so we can use * the usual normalization. + * + * wall_to_monotonic is moved after resume from suspend for the monotonic + * time not to jump. We need to add total_sleep_time to wall_to_monotonic + * to get the real boot based time offset. + * + * - wall_to_monotonic is no longer the boot time, getboottime must be + * used instead. */ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); +static unsigned long total_sleep_time; /* seconds */ EXPORT_SYMBOL(xtime); @@ -251,6 +259,7 @@ void __init timekeeping_init(void) xtime.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); + total_sleep_time = 0; write_sequnlock_irqrestore(&xtime_lock, flags); } @@ -282,6 +291,7 @@ static int timekeeping_resume(struct sys_device *dev) xtime.tv_sec += sleep_length; wall_to_monotonic.tv_sec -= sleep_length; + total_sleep_time += sleep_length; } /* re-base the last cycle value */ clock->cycle_last = clocksource_read(clock); @@ -476,3 +486,30 @@ void update_wall_time(void) change_clocksource(); update_vsyscall(&xtime, clock); } + +/** + * getboottime - Return the real time of system boot. + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + * + * This is based on the wall_to_monotonic offset and the total suspend + * time. Calls to settimeofday will affect the value returned (which + * basically means that however wrong your real time clock is at boot time, + * you get the right time here). + */ +void getboottime(struct timespec *ts) +{ + set_normalized_timespec(ts, + - (wall_to_monotonic.tv_sec + total_sleep_time), + - wall_to_monotonic.tv_nsec); +} + +/** + * monotonic_to_bootbased - Convert the monotonic time to boot based. + * @ts: pointer to the timespec to be converted + */ +void monotonic_to_bootbased(struct timespec *ts) +{ + ts->tv_sec += total_sleep_time; +} -- cgit v1.1 From 924b42d5a2dbe508407a0a6290d3751f826bccdd Mon Sep 17 00:00:00 2001 From: Tomas Janousek Date: Sun, 15 Jul 2007 23:39:42 -0700 Subject: Use boot based time for process start time and boot time in /proc Commit 411187fb05cd11676b0979d9fbf3291db69dbce2 caused boot time to move and process start times to become invalid after suspend. Using boot based time for those restores the old behaviour and fixes the issue. 
[akpm@linux-foundation.org: little cleanup] Signed-off-by: Tomas Janousek Cc: Tomas Smetana Acked-by: John Stultz Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index da3a155..344d693 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1059,6 +1059,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); + p->real_start_time = p->start_time; + monotonic_to_bootbased(&p->real_start_time); p->security = NULL; p->io_context = NULL; p->io_wait = NULL; -- cgit v1.1 From d62141414a55ff3f1410b27db2a95224446e77a4 Mon Sep 17 00:00:00 2001 From: Tomas Janousek Date: Sun, 15 Jul 2007 23:39:42 -0700 Subject: Use boot based time for uptime in /proc Commit 411187fb05cd11676b0979d9fbf3291db69dbce2 caused uptime not to increase during suspend. This may cause confusion so I restore the old behaviour by using the boot based time instead of monotonic for uptime. Signed-off-by: Tomas Janousek Acked-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 1a69705..1ab3106 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1114,6 +1114,7 @@ int do_sysinfo(struct sysinfo *info) getnstimeofday(&tp); tp.tv_sec += wall_to_monotonic.tv_sec; tp.tv_nsec += wall_to_monotonic.tv_nsec; + monotonic_to_bootbased(&tp); if (tp.tv_nsec - NSEC_PER_SEC >= 0) { tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; tp.tv_sec++; -- cgit v1.1 From 85653af7d488702165eba72c6c1dd0250fae4e70 Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Sun, 15 Jul 2007 23:39:47 -0700 Subject: Fix stop_machine_run problem with naughty real time process stop_machine_run() does its work on "kstopmachine" thread having max priority. However that thread get such priority after woken up. Therefore, in the following case ... - "kstopmachine" try to run on CPU1 - There is a real time process which doesn't relinquish CPU time voluntary on CPU1 ... "kstopmachine" can't start to run and the CPU on which stop_machine_run() is runing hangs up. To fix this problem, call sched_setscheduler() before waking up that thread. Signed-off-by: Satoru Takeuchi Cc: Rusty Russell Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Ashok Raj Cc: Gautham R Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index fcee2a8..319821e 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -93,10 +93,6 @@ static void stopmachine_set_state(enum stopmachine_state state) static int stop_machine(void) { int i, ret = 0; - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - - /* One high-prio thread per cpu. We'll do this one. */ - sched_setscheduler(current, SCHED_FIFO, ¶m); atomic_set(&stopmachine_thread_ack, 0); stopmachine_num_threads = 0; @@ -189,6 +185,10 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, p = kthread_create(do_stop, &smdata, "kstopmachine"); if (!IS_ERR(p)) { + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + + /* One high-prio thread per cpu. We'll do this one. 
*/ + sched_setscheduler(p, SCHED_FIFO, ¶m); kthread_bind(p, cpu); wake_up_process(p); wait_for_completion(&smdata.done); -- cgit v1.1 From 1c6b4aa94576eee6dec3b8011f60d7f666db90b0 Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Sun, 15 Jul 2007 23:39:48 -0700 Subject: cpu hotplug: fix ksoftirqd termination on cpu hotplug with naughty realtime process Fix ksoftirqd termination on cpu hotplug with naughty real time process. Assuming the following case: - Try to hot remove CPU2 from CPU1. - There is a real time process on CPU2, and that process doesn't sleep at all. - That rt process and ksoftirqd/2 is migrated to the CPU0 Then ksoftirqd/2 can't stop becasue that rt process runs everlastingly on CPU0, and CPU1 waiting the ksoftirqd/2's termination hangs up. To fix this problem, set the priority of ksoftirqd/2 to max one before kthread_stop(). [akpm@linux-foundation.org: fix warning] Signed-off-by: Satoru Takeuchi Cc: Rusty Russell Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Ashok Raj Cc: Gautham R Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 73217a9..8de2677 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -614,12 +614,16 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, kthread_bind(per_cpu(ksoftirqd, hotcpu), any_online_cpu(cpu_online_map)); case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DEAD_FROZEN: { + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; + sched_setscheduler(p, SCHED_FIFO, ¶m); kthread_stop(p); takeover_tasklets(hotcpu); break; + } #endif /* CONFIG_HOTPLUG_CPU */ } return NOTIFY_OK; -- cgit v1.1 From 708f4b522371da5e6c615a49e1844195aff84cb4 Mon Sep 17 00:00:00 2001 From: Pavel Emelianov Date: Sun, 15 Jul 2007 23:39:54 -0700 Subject: Make /proc/modules use seq_list_xxx helpers Here there is not need even in .show callback altering. The original code passes list_head in *v. Signed-off-by: Pavel Emelianov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 015d60c..7a1a4d3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2232,26 +2232,13 @@ unsigned long module_kallsyms_lookup_name(const char *name) /* Called by the /proc file system to return a list of modules. */ static void *m_start(struct seq_file *m, loff_t *pos) { - struct list_head *i; - loff_t n = 0; - mutex_lock(&module_mutex); - list_for_each(i, &modules) { - if (n++ == *pos) - break; - } - if (i == &modules) - return NULL; - return i; + return seq_list_start(&modules, *pos); } static void *m_next(struct seq_file *m, void *p, loff_t *pos) { - struct list_head *i = p; - (*pos)++; - if (i->next == &modules) - return NULL; - return i->next; + return seq_list_next(p, &modules, pos); } static void m_stop(struct seq_file *m, void *p) -- cgit v1.1 From 45807a1df9f51d28d0ff0c6bcf900c210411d7c9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 15 Jul 2007 23:40:10 -0700 Subject: vdso: print fatal signals Add the print-fatal-signals=1 boot option and the /proc/sys/kernel/print-fatal-signals runtime switch. This feature prints some minimal information about userspace segfaults to the kernel console. 
This is useful to find early bootup bugs where userspace debugging is very hard. Defaults to off. [akpm@linux-foundation.org: Don't add new sysctl numbers] Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 33 +++++++++++++++++++++++++++++++++ kernel/sysctl.c | 9 +++++++++ 2 files changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f940560..39d1227 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -718,6 +718,37 @@ out_set: #define LEGACY_QUEUE(sigptr, sig) \ (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig))) +int print_fatal_signals; + +static void print_fatal_signal(struct pt_regs *regs, int signr) +{ + printk("%s/%d: potentially unexpected fatal signal %d.\n", + current->comm, current->pid, signr); + +#ifdef __i386__ + printk("code at %08lx: ", regs->eip); + { + int i; + for (i = 0; i < 16; i++) { + unsigned char insn; + + __get_user(insn, (unsigned char *)(regs->eip + i)); + printk("%02x ", insn); + } + } +#endif + printk("\n"); + show_regs(regs); +} + +static int __init setup_print_fatal_signals(char *str) +{ + get_option (&str, &print_fatal_signals); + + return 1; +} + +__setup("print-fatal-signals=", setup_print_fatal_signals); static int specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) @@ -1855,6 +1886,8 @@ relock: * Anything else is fatal, maybe with a core dump. */ current->flags |= PF_SIGNALED; + if ((signr != SIGKILL) && print_fatal_signals) + print_fatal_signal(regs, signr); if (sig_kernel_coredump(signr)) { /* * If it was able to dump core, this kills all diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ccaebbb..2cce228 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,6 +61,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, /* External variables not in a header file. */ extern int C_A_D; +extern int print_fatal_signals; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int sysctl_panic_on_oom; @@ -340,6 +341,14 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "print-fatal-signals", + .data = &print_fatal_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef __sparc__ { .ctl_name = KERN_SPARC_REBOOT, -- cgit v1.1 From c2aef333c98b41eeb0f0d55b7faa7d4625a6160b Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Sun, 15 Jul 2007 23:40:11 -0700 Subject: Reduce cpuset.c write_lock_irq() to read_lock() cpuset.c:update_nodemask() uses a write_lock_irq() on tasklist_lock to block concurrent forks; a read_lock() suffices and is less intrusive. 
Signed-off-by: Paul Menage Acked-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4c49188..824b1c0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -981,10 +981,10 @@ static int update_nodemask(struct cpuset *cs, char *buf) mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); if (!mmarray) goto done; - write_lock_irq(&tasklist_lock); /* block fork */ + read_lock(&tasklist_lock); /* block fork */ if (atomic_read(&cs->count) <= ntasks) break; /* got enough */ - write_unlock_irq(&tasklist_lock); /* try again */ + read_unlock(&tasklist_lock); /* try again */ kfree(mmarray); } @@ -1006,7 +1006,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) continue; mmarray[n++] = mm; } while_each_thread(g, p); - write_unlock_irq(&tasklist_lock); + read_unlock(&tasklist_lock); /* * Now that we've dropped the tasklist spinlock, we can -- cgit v1.1 From 78c1b0657475dbafa008c71e3ccdc32141d8c7c7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 15 Jul 2007 23:40:15 -0700 Subject: Remove clockevents_{release,request}_device Not called by anything in tree. Signed-off-by: Andi Kleen Acked-by: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/clockevents.c | 41 ----------------------------------------- 1 file changed, 41 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 76212b2..2ad1c37 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -205,47 +205,6 @@ void clockevents_exchange_device(struct clock_event_device *old, } /** - * clockevents_request_device - */ -struct clock_event_device *clockevents_request_device(unsigned int features, - cpumask_t cpumask) -{ - struct clock_event_device *cur, *dev = NULL; - struct list_head *tmp; - - spin_lock(&clockevents_lock); - - list_for_each(tmp, &clockevent_devices) { - cur = list_entry(tmp, struct clock_event_device, list); - - if ((cur->features & features) == features && - cpus_equal(cpumask, cur->cpumask)) { - if (!dev || dev->rating < cur->rating) - dev = cur; - } - } - - clockevents_exchange_device(NULL, dev); - - spin_unlock(&clockevents_lock); - - return dev; -} - -/** - * clockevents_release_device - */ -void clockevents_release_device(struct clock_event_device *dev) -{ - spin_lock(&clockevents_lock); - - clockevents_exchange_device(dev, NULL); - clockevents_notify_released(); - - spin_unlock(&clockevents_lock); -} - -/** * clockevents_notify - notification about relevant events */ void clockevents_notify(unsigned long reason, void *arg) -- cgit v1.1 From e84845c4bf9a00533352e5805b35f42acdb04a1e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 15 Jul 2007 23:40:25 -0700 Subject: add printk.time option, deprecate 'time' Allow printk_time to be enabled or disabled at boot time. Previously it could be enabled only, but not disabled. Change printk_time from an int to a bool since that's what it is. Make its logical (exposed) name just be "time" (was "printk_time"). Note: Changes kernel boot option syntax from "time" to "printk.time=value". Since printk_time is declared as a module_param, it can also be changed at run-time by modifying /sys/module/printk/parameters/time to a value of 1/Y/y to enabled it or 0/N/n to disable it. 
Since printk_time is declared as a module_param, its value can also be set at boot-time by using linux printk.time= If the "time" boot option is used, print a message that it is deprecated and will be removed. Note its planned removal in feature-removal-schedule.txt. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index f46cc6d..fccacf7 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -449,13 +449,16 @@ static int printk_time = 1; #else static int printk_time = 0; #endif -module_param(printk_time, int, S_IRUGO | S_IWUSR); +module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static int __init printk_time_setup(char *str) { if (*str) return 0; printk_time = 1; + printk(KERN_NOTICE "The 'time' option is deprecated and " + "is scheduled for removal in early 2008\n"); + printk(KERN_NOTICE "Use 'printk.time=' instead\n"); return 1; } -- cgit v1.1 From c5c061b8f9726bc2c25e19dec227933a13d1e6b7 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Sun, 15 Jul 2007 23:40:30 -0700 Subject: Add a flag to indicate deferrable timers in /proc/timer_stats Add a flag in /proc/timer_stats to indicate deferrable timers. This will let developers/users to differentiate between types of tiemrs in /proc/timer_stats. Deferrable timer and normal timer will appear in /proc/timer_stats as below. 10D, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) 10, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) Also version of timer_stats changes from v0.1 to v0.2 Signed-off-by: Venkatesh Pallipadi Acked-by: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/timer_stats.c | 14 +++++++++++--- kernel/timer.c | 14 ++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 3216937..9b8a826 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -68,6 +68,7 @@ struct entry { * Number of timeout events: */ unsigned long count; + unsigned int timer_flag; /* * We save the command-line string to preserve @@ -231,7 +232,8 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) * incremented. Otherwise the timer is registered in a free slot. 
*/ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char * comm) + void *timerf, char *comm, + unsigned int timer_flag) { /* * It doesnt matter which lock we take: @@ -249,6 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.start_func = startf; input.expire_func = timerf; input.pid = pid; + input.timer_flag = timer_flag; spin_lock_irqsave(lock, flags); if (!active) @@ -295,7 +298,7 @@ static int tstats_show(struct seq_file *m, void *v) period = ktime_to_timespec(time); ms = period.tv_nsec / 1000000; - seq_puts(m, "Timer Stats Version: v0.1\n"); + seq_puts(m, "Timer Stats Version: v0.2\n"); seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); if (atomic_read(&overflow_count)) seq_printf(m, "Overflow: %d entries\n", @@ -303,8 +306,13 @@ static int tstats_show(struct seq_file *m, void *v) for (i = 0; i < nr_entries; i++) { entry = entries + i; - seq_printf(m, "%4lu, %5d %-16s ", + if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { + seq_printf(m, "%4luD, %5d %-16s ", entry->count, entry->pid, entry->comm); + } else { + seq_printf(m, " %4lu, %5d %-16s ", + entry->count, entry->pid, entry->comm); + } print_name_offset(m, (unsigned long)entry->start_func); seq_puts(m, " ("); diff --git a/kernel/timer.c b/kernel/timer.c index 1ab3106..1258371 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -305,6 +305,20 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); timer->start_pid = current->pid; } + +static void timer_stats_account_timer(struct timer_list *timer) +{ + unsigned int flag = 0; + + if (unlikely(tbase_get_deferrable(timer->base))) + flag |= TIMER_STATS_FLAG_DEFERRABLE; + + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, flag); +} + +#else +static void timer_stats_account_timer(struct timer_list *timer) {} #endif /** -- cgit v1.1 From aa0ac36518be648dda3a32f0b37a8b2b546e1b24 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 15 Jul 2007 23:40:39 -0700 Subject: Remove capability.h from mm.h I forgot to remove capability.h from mm.h while removing sched.h! This patch remedies that, because the only inline function which was using CAP_something was made out of line. Cross-compile tested without regressions on: all powerpc defconfigs all mips defconfigs all m68k defconfigs all arm defconfigs all ia64 defconfigs alpha alpha-allnoconfig alpha-defconfig alpha-up arm i386 i386-allnoconfig i386-defconfig i386-up ia64 ia64-allnoconfig ia64-defconfig ia64-up m68k mips parisc parisc-allnoconfig parisc-defconfig parisc-up powerpc powerpc-up s390 s390-allnoconfig s390-defconfig s390-up sparc sparc-allnoconfig sparc-defconfig sparc-up sparc64 sparc64-allnoconfig sparc64-defconfig sparc64-up um-x86_64 x86_64 x86_64-allnoconfig x86_64-defconfig x86_64-up as well as my two usual configs. 
Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index cf53bb5..438c6b7 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -13,7 +13,7 @@ #include #include #include - +#include #include #include -- cgit v1.1 From b663a79c191508f27cd885224b592a878c0ba0f6 Mon Sep 17 00:00:00 2001 From: Maxim Uvarov Date: Sun, 15 Jul 2007 23:40:48 -0700 Subject: taskstats: add context-switch counters Make available to the user the following task and process performance statistics: * Involuntary Context Switches (task_struct->nivcsw) * Voluntary Context Switches (task_struct->nvcsw) Statistics information is available from: 1. taskstats interface (Documentation/accounting/) 2. /proc/PID/status (task only). This data is useful for detecting hyperactivity patterns between processes. [akpm@linux-foundation.org: cleanup] Signed-off-by: Maxim Uvarov Cc: Shailabh Nagar Cc: Balbir Singh Cc: Jay Lan Cc: Jonathan Lim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 906cae7..059431e 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -196,6 +196,8 @@ static int fill_pid(pid_t pid, struct task_struct *tsk, /* fill in basic acct fields */ stats->version = TASKSTATS_VERSION; + stats->nvcsw = tsk->nvcsw; + stats->nivcsw = tsk->nivcsw; bacct_add_tsk(stats, tsk); /* fill in extended acct fields */ @@ -242,6 +244,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, */ delayacct_add_tsk(stats, tsk); + stats->nvcsw += tsk->nvcsw; + stats->nivcsw += tsk->nivcsw; } while_each_thread(first, tsk); unlock_task_sighand(first, &flags); -- cgit v1.1 From 4f27c00bf80f122513d3a5be16ed851573164534 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Sun, 15 Jul 2007 23:40:55 -0700 Subject: Improve behaviour of spurious IRQ detect Currently we handle spurious IRQ activity based upon seeing a lot of invalid interrupts, and we clear things back on the base of lots of valid interrupts. Unfortunately in some cases you get legitimate invalid interrupts caused by timing asynchronicity between the PCI bus and the APIC bus when disabling interrupts and pulling other tricks. In this case although the spurious IRQs are not a problem our unhandled counters didn't clear and they act as a slow running timebomb. (This is effectively what the serial port/tty problem that was fixed by clearing counters when registering a handler showed up) It's easy enough to add a second parameter - time. This means that if we see a regular stream of harmless spurious interrupts which are not harming processing we don't go off and do something stupid like disable the IRQ after a month of running. 
OTOH lockups and performance killers show up a lot more than 10/second [akpm@linux-foundation.org: cleanup] Signed-off-by: Alan Cox Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/spurious.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index bd9e272..32b1619 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -172,7 +172,17 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { if (unlikely(action_ret != IRQ_HANDLED)) { - desc->irqs_unhandled++; + /* + * If we are seeing only the odd spurious IRQ caused by + * bus asynchronicity then don't eventually trigger an error, + * otherwise the couter becomes a doomsday timer for otherwise + * working systems + */ + if (jiffies - desc->last_unhandled > HZ/10) + desc->irqs_unhandled = 1; + else + desc->irqs_unhandled++; + desc->last_unhandled = jiffies; if (unlikely(action_ret != IRQ_NONE)) report_bad_irq(irq, desc, action_ret); } -- cgit v1.1 From 522ed7767e800cff6c650ec64b0ee0677303119c Mon Sep 17 00:00:00 2001 From: Miloslav Trmac Date: Sun, 15 Jul 2007 23:40:56 -0700 Subject: Audit: add TTY input auditing Add TTY input auditing, used to audit system administrator's actions. This is required by various security standards such as DCID 6/3 and PCI to provide non-repudiation of administrator's actions and to allow a review of past actions if the administrator seems to overstep their duties or if the system becomes misconfigured for unknown reasons. These requirements do not make it necessary to audit TTY output as well. Compared to an user-space keylogger, this approach records TTY input using the audit subsystem, correlated with other audit events, and it is completely transparent to the user-space application (e.g. the console ioctls still work). TTY input auditing works on a higher level than auditing all system calls within the session, which would produce an overwhelming amount of mostly useless audit events. Add an "audit_tty" attribute, inherited across fork (). Data read from TTYs by process with the attribute is sent to the audit subsystem by the kernel. The audit netlink interface is extended to allow modifying the audit_tty attribute, and to allow sending explanatory audit events from user-space (for example, a shell might send an event containing the final command, after the interactive command-line editing and history expansion is performed, which might be difficult to decipher from the TTY input alone). Because the "audit_tty" attribute is inherited across fork (), it would be set e.g. for sshd restarted within an audited session. To prevent this, the audit_tty attribute is cleared when a process with no open TTY file descriptors (e.g. after daemon startup) opens a TTY. See https://www.redhat.com/archives/linux-audit/2007-June/msg00000.html for a more detailed rationale document for an older version of this patch. 
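As a rough illustration (not part of this patch), enabling the new per-process flag from user space could look like the sketch below. It assumes the AUDIT_TTY_SET command and the struct audit_tty_status layout that this series adds to <linux/audit.h>, it needs CAP_AUDIT_CONTROL per the audit_netlink_ok() change, and it omits all error and ACK handling; the kernel resolves the target task from the sender, so this toggles auditing for the calling process.

	/*
	 * Hypothetical user-space sketch: turn on TTY input auditing for the
	 * calling process via AUDIT_TTY_SET.  Requires a <linux/audit.h>
	 * that already contains this patch; no error or ACK handling.
	 */
	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <linux/netlink.h>
	#include <linux/audit.h>

	int main(void)
	{
		struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
		struct {
			struct nlmsghdr nlh;
			struct audit_tty_status s;
		} req;
		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

		if (fd < 0)
			return 1;

		memset(&req, 0, sizeof(req));
		req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(req.s));	/* header + payload */
		req.nlh.nlmsg_type  = AUDIT_TTY_SET;
		req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
		req.nlh.nlmsg_pid   = getpid();
		req.s.enabled       = 1;			/* enable TTY auditing */

		sendto(fd, &req, req.nlh.nlmsg_len, 0,
		       (struct sockaddr *)&kernel, sizeof(kernel));
		close(fd);
		return 0;
	}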
[akpm@linux-foundation.org: build fix] Signed-off-by: Miloslav Trmac Cc: Al Viro Cc: Alan Cox Cc: Paul Fulghum Cc: Casey Schaufler Cc: Steve Grubb Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++----- kernel/audit.h | 1 - kernel/auditsc.c | 3 -- kernel/exit.c | 2 ++ kernel/fork.c | 3 ++ 5 files changed, 93 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d13276d..5ce8851 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -58,6 +58,7 @@ #include #include #include +#include #include "audit.h" @@ -423,6 +424,31 @@ static int kauditd_thread(void *dummy) return 0; } +static int audit_prepare_user_tty(pid_t pid, uid_t loginuid) +{ + struct task_struct *tsk; + int err; + + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + err = -ESRCH; + if (!tsk) + goto out; + err = 0; + + spin_lock_irq(&tsk->sighand->siglock); + if (!tsk->signal->audit_tty) + err = -EPERM; + spin_unlock_irq(&tsk->sighand->siglock); + if (err) + goto out; + + tty_audit_push_task(tsk, loginuid); +out: + read_unlock(&tasklist_lock); + return err; +} + int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; @@ -511,6 +537,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) case AUDIT_DEL: case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: + case AUDIT_TTY_GET: + case AUDIT_TTY_SET: if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) err = -EPERM; break; @@ -622,6 +650,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) err = audit_filter_user(&NETLINK_CB(skb), msg_type); if (err == 1) { err = 0; + if (msg_type == AUDIT_USER_TTY) { + err = audit_prepare_user_tty(pid, loginuid); + if (err) + break; + } ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (ab) { audit_log_format(ab, @@ -638,8 +671,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) " subj=%s", ctx); kfree(ctx); } - audit_log_format(ab, " msg='%.1024s'", - (char *)data); + if (msg_type != AUDIT_USER_TTY) + audit_log_format(ab, " msg='%.1024s'", + (char *)data); + else { + int size; + + audit_log_format(ab, " msg="); + size = nlmsg_len(nlh); + audit_log_n_untrustedstring(ab, size, + data); + } audit_set_pid(ab, pid); audit_log_end(ab); } @@ -730,6 +772,45 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); break; + case AUDIT_TTY_GET: { + struct audit_tty_status s; + struct task_struct *tsk; + + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + if (!tsk) + err = -ESRCH; + else { + spin_lock_irq(&tsk->sighand->siglock); + s.enabled = tsk->signal->audit_tty != 0; + spin_unlock_irq(&tsk->sighand->siglock); + } + read_unlock(&tasklist_lock); + audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, + &s, sizeof(s)); + break; + } + case AUDIT_TTY_SET: { + struct audit_tty_status *s; + struct task_struct *tsk; + + if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) + return -EINVAL; + s = data; + if (s->enabled != 0 && s->enabled != 1) + return -EINVAL; + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + if (!tsk) + err = -ESRCH; + else { + spin_lock_irq(&tsk->sighand->siglock); + tsk->signal->audit_tty = s->enabled != 0; + spin_unlock_irq(&tsk->sighand->siglock); + } + read_unlock(&tasklist_lock); + break; + } default: err = -EINVAL; break; @@ -1185,7 +1266,7 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, } /** 
- * audit_log_n_unstrustedstring - log a string that may contain random characters + * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @len: lenth of string (not including trailing null) * @string: string to be logged @@ -1201,25 +1282,24 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, const char *string) { - const unsigned char *p = string; + const unsigned char *p; - while (*p) { + for (p = string; p < (const unsigned char *)string + len && *p; p++) { if (*p == '"' || *p < 0x21 || *p > 0x7f) { audit_log_hex(ab, string, len); return string + len + 1; } - p++; } audit_log_n_string(ab, len, string); return p + 1; } /** - * audit_log_unstrustedstring - log a string that may contain random characters + * audit_log_untrustedstring - log a string that may contain random characters * @ab: audit_buffer * @string: string to be logged * - * Same as audit_log_n_unstrustedstring(), except that strlen is used to + * Same as audit_log_n_untrustedstring(), except that strlen is used to * determine string length. */ const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) diff --git a/kernel/audit.h b/kernel/audit.h index 815d6f5..9587743 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -115,7 +115,6 @@ extern struct sk_buff * audit_make_reply(int pid, int seq, int type, extern void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size); -extern void audit_log_lost(const char *message); extern void audit_panic(const char *message); struct audit_netlink_list { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e36481e..7ccc3da 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -71,9 +71,6 @@ extern struct list_head audit_filter_list[]; -/* No syscall auditing will take place unless audit_enabled != 0. */ -extern int audit_enabled; - /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). */ #define AUDIT_NAMES 20 diff --git a/kernel/exit.c b/kernel/exit.c index 64a5263..5762669 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -965,6 +965,8 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(tsk->compat_robust_list)) compat_exit_robust_list(tsk); #endif + if (group_dead) + tty_audit_exit(); if (unlikely(tsk->audit_context)) audit_free(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 344d693..4015912a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -897,6 +898,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts } acct_init_pacct(&sig->pacct); + tty_audit_fork(sig); + return 0; } -- cgit v1.1 From 7d69a1f4a72b18876c99c697692b78339d491568 Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Sun, 15 Jul 2007 23:40:58 -0700 Subject: remove CONFIG_UTS_NS and CONFIG_IPC_NS CONFIG_UTS_NS and CONFIG_IPC_NS have very little value as they only deactivate the unshare of the uts and ipc namespaces and do not improve performance. Signed-off-by: Cedric Le Goater Acked-by: "Serge E. Hallyn" Cc: Eric W. 
Biederman Cc: Herbert Poetzl Cc: Pavel Emelianov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 4 ++-- kernel/nsproxy.c | 10 ---------- kernel/utsname_sysctl.c | 5 +---- 3 files changed, 3 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 642d427..fa8efd4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \ + utsname.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ @@ -48,7 +49,6 @@ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o -obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 9e83b58..e38bed7 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -155,16 +155,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) return 0; -#ifndef CONFIG_IPC_NS - if (unshare_flags & CLONE_NEWIPC) - return -EINVAL; -#endif - -#ifndef CONFIG_UTS_NS - if (unshare_flags & CLONE_NEWUTS) - return -EINVAL; -#endif - if (!capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index f22b9db..c76c064 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -18,10 +18,7 @@ static void *get_uts(ctl_table *table, int write) { char *which = table->data; -#ifdef CONFIG_UTS_NS - struct uts_namespace *uts_ns = current->nsproxy->uts_ns; - which = (which - (char *)&init_uts_ns) + (char *)uts_ns; -#endif + if (!write) down_read(&uts_sem); else -- cgit v1.1 From acce292c82d4d82d35553b928df2b0597c3a9c78 Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Sun, 15 Jul 2007 23:40:59 -0700 Subject: user namespace: add the framework Basically, it will allow a process to unshare its user_struct table, resetting at the same time its own user_struct and all the associated accounting. A new root user (uid == 0) is added to the user namespace upon creation. Such root users have full privileges and it seems that theses privileges should be controlled through some means (process capabilities ?) The unshare is not included in this patch. Changes since [try #4]: - Updated get_user_ns and put_user_ns to accept NULL, and get_user_ns to return the namespace. Changes since [try #3]: - moved struct user_namespace to files user_namespace.{c,h} Changes since [try #2]: - removed struct user_namespace* argument from find_user() Changes since [try #1]: - removed struct user_namespace* argument from find_user() - added a root_user per user namespace Signed-off-by: Cedric Le Goater Signed-off-by: Serge E. Hallyn Acked-by: Pavel Emelianov Cc: Herbert Poetzl Cc: Kirill Korotaev Cc: Eric W. 
Biederman Cc: Chris Wright Cc: Stephen Smalley Cc: James Morris Cc: Andrew Morgan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- kernel/fork.c | 2 +- kernel/nsproxy.c | 9 +++++++++ kernel/sys.c | 5 +++-- kernel/user.c | 18 +++++++++--------- kernel/user_namespace.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 66 insertions(+), 13 deletions(-) create mode 100644 kernel/user_namespace.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index fa8efd4..2a99983 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -4,7 +4,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ exit.o itimer.o time.o softirq.o resource.o \ - sysctl.o capability.o ptrace.o timer.o user.o \ + sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ diff --git a/kernel/fork.c b/kernel/fork.c index 4015912a..13cf097 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1002,7 +1002,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (atomic_read(&p->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->user != &root_user) + p->user != current->nsproxy->user_ns->root_user) goto bad_fork_free; } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index e38bed7..895e3a3 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -79,8 +79,15 @@ static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, if (IS_ERR(new_nsp->pid_ns)) goto out_pid; + new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns); + if (IS_ERR(new_nsp->user_ns)) + goto out_user; + return new_nsp; +out_user: + if (new_nsp->pid_ns) + put_pid_ns(new_nsp->pid_ns); out_pid: if (new_nsp->ipc_ns) put_ipc_ns(new_nsp->ipc_ns); @@ -140,6 +147,8 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns) put_pid_ns(ns->pid_ns); + if (ns->user_ns) + put_user_ns(ns->user_ns); kfree(ns); } diff --git a/kernel/sys.c b/kernel/sys.c index 872271c..ed92e2f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -1078,13 +1079,13 @@ static int set_user(uid_t new_ruid, int dumpclear) { struct user_struct *new_user; - new_user = alloc_uid(new_ruid); + new_user = alloc_uid(current->nsproxy->user_ns, new_ruid); if (!new_user) return -EAGAIN; if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && - new_user != &root_user) { + new_user != current->nsproxy->user_ns->root_user) { free_uid(new_user); return -EAGAIN; } diff --git a/kernel/user.c b/kernel/user.c index 4869563..98b8250 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -14,20 +14,19 @@ #include #include #include +#include +#include /* * UID task count cache, to get fast user lookup in "alloc_uid" * when changing user ID's (ie setuid() and friends). */ -#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 
3 : 8) -#define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) -#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) +#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) static struct kmem_cache *uid_cachep; -static struct list_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is @@ -94,9 +93,10 @@ struct user_struct *find_user(uid_t uid) { struct user_struct *ret; unsigned long flags; + struct user_namespace *ns = current->nsproxy->user_ns; spin_lock_irqsave(&uidhash_lock, flags); - ret = uid_hash_find(uid, uidhashentry(uid)); + ret = uid_hash_find(uid, uidhashentry(ns, uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } @@ -120,9 +120,9 @@ void free_uid(struct user_struct *up) } } -struct user_struct * alloc_uid(uid_t uid) +struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) { - struct list_head *hashent = uidhashentry(uid); + struct list_head *hashent = uidhashentry(ns, uid); struct user_struct *up; spin_lock_irq(&uidhash_lock); @@ -211,11 +211,11 @@ static int __init uid_cache_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); for(n = 0; n < UIDHASH_SZ; ++n) - INIT_LIST_HEAD(uidhash_table + n); + INIT_LIST_HEAD(init_user_ns.uidhash_table + n); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); - uid_hash_insert(&root_user, uidhashentry(0)); + uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); spin_unlock_irq(&uidhash_lock); return 0; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c new file mode 100644 index 0000000..3d79642 --- /dev/null +++ b/kernel/user_namespace.c @@ -0,0 +1,43 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include +#include +#include +#include + +struct user_namespace init_user_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, + .root_user = &root_user, +}; + +EXPORT_SYMBOL_GPL(init_user_ns); + +#ifdef CONFIG_USER_NS + +struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns) +{ + struct user_namespace *new_ns; + + BUG_ON(!old_ns); + get_user_ns(old_ns); + + new_ns = old_ns; + return new_ns; +} + +void free_user_ns(struct kref *kref) +{ + struct user_namespace *ns; + + ns = container_of(kref, struct user_namespace, kref); + kfree(ns); +} + +#endif /* CONFIG_USER_NS */ -- cgit v1.1 From 77ec739d8d0979477fc91f530403805afa2581a4 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Sun, 15 Jul 2007 23:41:01 -0700 Subject: user namespace: add unshare This patch enables the unshare of user namespaces. It adds a new clone flag CLONE_NEWUSER and implements copy_user_ns() which resets the current user_struct and adds a new root user (uid == 0) For now, unsharing the user namespace allows a process to reset its user_struct accounting and uid 0 in the new user namespace should be contained using appropriate means, for instance selinux The plan, when the full support is complete (all uid checks covered), is to keep the original user's rights in the original namespace, and let a process become uid 0 in the new namespace, with full capabilities to the new namespace. Signed-off-by: Serge E. 
Hallyn Signed-off-by: Cedric Le Goater Acked-by: Pavel Emelianov Cc: Herbert Poetzl Cc: Kirill Korotaev Cc: Eric W. Biederman Cc: Chris Wright Cc: Stephen Smalley Cc: James Morris Cc: Andrew Morgan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- kernel/nsproxy.c | 5 +++-- kernel/user_namespace.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 49 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 13cf097..7c5c588 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1606,7 +1606,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER)) goto bad_unshare_out; if ((err = unshare_thread(unshare_flags))) diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 895e3a3..5aa28e2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -117,7 +117,7 @@ int copy_namespaces(int flags, struct task_struct *tsk) get_nsproxy(old_ns); - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER))) return 0; if (!capable(CAP_SYS_ADMIN)) { @@ -161,7 +161,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, { int err = 0; - if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWUSER))) return 0; if (!capable(CAP_SYS_ADMIN)) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 3d79642..89a27e8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,6 +21,45 @@ EXPORT_SYMBOL_GPL(init_user_ns); #ifdef CONFIG_USER_NS +/* + * Clone a new ns copying an original user ns, setting refcount to 1 + * @old_ns: namespace to clone + * Return NULL on error (failure to kmalloc), new ns otherwise + */ +static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) +{ + struct user_namespace *ns; + struct user_struct *new_user; + int n; + + ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); + if (!ns) + return NULL; + + kref_init(&ns->kref); + + for (n = 0; n < UIDHASH_SZ; ++n) + INIT_LIST_HEAD(ns->uidhash_table + n); + + /* Insert new root user. */ + ns->root_user = alloc_uid(ns, 0); + if (!ns->root_user) { + kfree(ns); + return NULL; + } + + /* Reset current->user with a new one */ + new_user = alloc_uid(ns, current->uid); + if (!new_user) { + free_uid(ns->root_user); + kfree(ns); + return NULL; + } + + switch_uid(new_user); + return ns; +} + struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns) { struct user_namespace *new_ns; @@ -28,7 +67,12 @@ struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns) BUG_ON(!old_ns); get_user_ns(old_ns); - new_ns = old_ns; + if (!(flags & CLONE_NEWUSER)) + return old_ns; + + new_ns = clone_user_ns(old_ns); + + put_user_ns(old_ns); return new_ns; } -- cgit v1.1 From 467e9f4b5086a60a5cb2e032ccaf4a31abadc4c2 Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Sun, 15 Jul 2007 23:41:06 -0700 Subject: fix create_new_namespaces() return value dup_mnt_ns() and clone_uts_ns() return NULL on failure. This is wrong, create_new_namespaces() uses ERR_PTR() to catch an error. This means that the subsequent create_new_namespaces() will hit BUG_ON() in copy_mnt_ns() or copy_utsname(). 
Modify create_new_namespaces() to also use the errors returned by the copy_*_ns routines and not to systematically return ENOMEM. [oleg@tv-sign.ru: better changelog] Signed-off-by: Cedric Le Goater Cc: Serge E. Hallyn Cc: Badari Pulavarty Cc: Pavel Emelianov Cc: Herbert Poetzl Cc: Eric W. Biederman Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 23 +++++++++++++++++------ kernel/user_namespace.c | 6 +++--- kernel/utsname.c | 10 ++++++---- 3 files changed, 26 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5aa28e2..15a6015 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -58,30 +58,41 @@ static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, struct fs_struct *new_fs) { struct nsproxy *new_nsp; + int err; new_nsp = clone_nsproxy(tsk->nsproxy); if (!new_nsp) return ERR_PTR(-ENOMEM); new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); - if (IS_ERR(new_nsp->mnt_ns)) + if (IS_ERR(new_nsp->mnt_ns)) { + err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; + } new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); - if (IS_ERR(new_nsp->uts_ns)) + if (IS_ERR(new_nsp->uts_ns)) { + err = PTR_ERR(new_nsp->uts_ns); goto out_uts; + } new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); - if (IS_ERR(new_nsp->ipc_ns)) + if (IS_ERR(new_nsp->ipc_ns)) { + err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; + } new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); - if (IS_ERR(new_nsp->pid_ns)) + if (IS_ERR(new_nsp->pid_ns)) { + err = PTR_ERR(new_nsp->pid_ns); goto out_pid; + } new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns); - if (IS_ERR(new_nsp->user_ns)) + if (IS_ERR(new_nsp->user_ns)) { + err = PTR_ERR(new_nsp->user_ns); goto out_user; + } return new_nsp; @@ -99,7 +110,7 @@ out_uts: put_mnt_ns(new_nsp->mnt_ns); out_ns: kfree(new_nsp); - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); } /* diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 89a27e8..d055d98 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -34,7 +34,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); if (!ns) - return NULL; + return ERR_PTR(-ENOMEM); kref_init(&ns->kref); @@ -45,7 +45,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) ns->root_user = alloc_uid(ns, 0); if (!ns->root_user) { kfree(ns); - return NULL; + return ERR_PTR(-ENOMEM); } /* Reset current->user with a new one */ @@ -53,7 +53,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) if (!new_user) { free_uid(ns->root_user); kfree(ns); - return NULL; + return ERR_PTR(-ENOMEM); } switch_uid(new_user); diff --git a/kernel/utsname.c b/kernel/utsname.c index 160c8c5..3ae4393 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -13,6 +13,7 @@ #include #include #include +#include /* * Clone a new ns copying an original utsname, setting refcount to 1 @@ -24,10 +25,11 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) struct uts_namespace *ns; ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); - if (ns) { - memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - kref_init(&ns->kref); - } + if (!ns) + return ERR_PTR(-ENOMEM); + + memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + kref_init(&ns->kref); return ns; } -- cgit v1.1 From 98c0d07cbf2a8582a0341b05ad564247e608f6f9 Mon Sep 17 00:00:00 2001 From: 
Cedric Le Goater Date: Sun, 15 Jul 2007 23:41:07 -0700 Subject: add a kmem_cache for nsproxy objects It should improve performance in some scenarii where a lot of these nsproxy objects are created by unsharing namespaces. This is a typical use of virtual servers that are being created or entered. This is also a good tool to find leaks and gather statistics on namespace usage. Signed-off-by: Cedric Le Goater Cc: Herbert Poetzl Cc: Pavel Emelianov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 15a6015..4b7fcc8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -21,6 +21,8 @@ #include #include +static struct kmem_cache *nsproxy_cachep; + struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); static inline void get_nsproxy(struct nsproxy *ns) @@ -43,9 +45,11 @@ static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) { struct nsproxy *ns; - ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); - if (ns) + ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); + if (ns) { + memcpy(ns, orig, sizeof(struct nsproxy)); atomic_set(&ns->count, 1); + } return ns; } @@ -109,7 +113,7 @@ out_uts: if (new_nsp->mnt_ns) put_mnt_ns(new_nsp->mnt_ns); out_ns: - kfree(new_nsp); + kmem_cache_free(nsproxy_cachep, new_nsp); return ERR_PTR(err); } @@ -160,7 +164,7 @@ void free_nsproxy(struct nsproxy *ns) put_pid_ns(ns->pid_ns); if (ns->user_ns) put_user_ns(ns->user_ns); - kfree(ns); + kmem_cache_free(nsproxy_cachep, ns); } /* @@ -185,3 +189,12 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, err = PTR_ERR(*new_nsp); return err; } + +static int __init nsproxy_cache_init(void) +{ + nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy), + 0, SLAB_PANIC, NULL, NULL); + return 0; +} + +module_init(nsproxy_cache_init); -- cgit v1.1 From 6d9525b52aecd11b14c4ec982add01c11157172f Mon Sep 17 00:00:00 2001 From: Henrik Kretzschmar Date: Sun, 15 Jul 2007 23:41:10 -0700 Subject: kerneldoc fix in audit_core_dumps Fix parameter name in audit_core_dumps for kerneldoc. Signed-off-by: Henrik Kretzschmar Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7ccc3da..b7640a5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2037,7 +2037,7 @@ int __audit_signal_info(int sig, struct task_struct *t) /** * audit_core_dumps - record information about processes that end abnormally - * @sig: signal value + * @signr: signal value * * If a process ends with a core dump, something fishy is going on and we * should record the event for investigation. -- cgit v1.1 From b716395e2b8e450e294537de0c91476ded2f0395 Mon Sep 17 00:00:00 2001 From: Vasily Tarasov Date: Sun, 15 Jul 2007 23:41:12 -0700 Subject: diskquota: 32bit quota tools on 64bit architectures OpenVZ Linux kernel team has discovered the problem with 32bit quota tools working on 64bit architectures. In 2.6.10 kernel sys32_quotactl() function was replaced by sys_quotactl() with the comment "sys_quotactl seems to be 32/64bit clean, enable it for 32bit" However this isn't right. 
Look at if_dqblk structure: struct if_dqblk { __u64 dqb_bhardlimit; __u64 dqb_bsoftlimit; __u64 dqb_curspace; __u64 dqb_ihardlimit; __u64 dqb_isoftlimit; __u64 dqb_curinodes; __u64 dqb_btime; __u64 dqb_itime; __u32 dqb_valid; }; For 32 bit quota tools sizeof(if_dqblk) == 0x44. But for 64 bit kernel its size is 0x48, 'cause of alignment! Thus we got a problem. Attached patch reintroduce sys32_quotactl() function, that handles this and related situations. [michal.k.k.piotrowski@gmail.com: build fix] [akpm@linux-foundation.org: Make it link with CONFIG_QUOTA=n] Signed-off-by: Vasily Tarasov Cc: Andi Kleen Cc: "Luck, Tony" Cc: Jan Kara Cc: Signed-off-by: Michal Piotrowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7e11e2c..b0ec498 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -14,6 +14,7 @@ asmlinkage long sys_ni_syscall(void) cond_syscall(sys_nfsservctl); cond_syscall(sys_quotactl); +cond_syscall(sys32_quotactl); cond_syscall(sys_acct); cond_syscall(sys_lookup_dcookie); cond_syscall(sys_swapon); -- cgit v1.1 From 213dd266d48af90c1eec8688c1ff31aa34d21de2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 15 Jul 2007 23:41:15 -0700 Subject: namespace: ensure clone_flags are always stored in an unsigned long While working on unshare support for the network namespace I noticed we were putting clone flags in an int. Which is weird because the syscall uses unsigned long and we at least need an unsigned to properly hold all of the unshare flags. So to make the code consistent, this patch updates the code to use unsigned long instead of int for the clone flags in those places where we get it wrong today. Signed-off-by: Eric W. Biederman Acked-by: Cedric Le Goater Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 6 +++--- kernel/pid.c | 2 +- kernel/utsname.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 4b7fcc8..10f0bbb 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -58,8 +58,8 @@ static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) * Return the newly created nsproxy. Do not attach this to the task, * leave it to the caller to do proper locking and attach it to task. */ -static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, - struct fs_struct *new_fs) +static struct nsproxy *create_new_namespaces(unsigned long flags, + struct task_struct *tsk, struct fs_struct *new_fs) { struct nsproxy *new_nsp; int err; @@ -121,7 +121,7 @@ out_ns: * called from clone. This now handles copy for nsproxy and all * namespaces therein. 
*/ -int copy_namespaces(int flags, struct task_struct *tsk) +int copy_namespaces(unsigned long flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; struct nsproxy *new_ns; diff --git a/kernel/pid.c b/kernel/pid.c index eb66bd2..c6e3f9f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -365,7 +365,7 @@ struct pid *find_ge_pid(int nr) } EXPORT_SYMBOL_GPL(find_get_pid); -struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns) +struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) { BUG_ON(!old_ns); get_pid_ns(old_ns); diff --git a/kernel/utsname.c b/kernel/utsname.c index 3ae4393..9d8180a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -39,7 +39,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns) +struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) { struct uts_namespace *new_ns; -- cgit v1.1 From 4e44f3497d41db4c3b9051c61410dee8ae4fb49c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 15 Jul 2007 23:41:18 -0700 Subject: sys_time() speedup Improve performance of sys_time(). sys_time() returns time in seconds, but it does so by calling do_gettimeofday() and then returning the tv_sec portion of the GTOD time. But the data structure "xtime", which is updated by every timer/scheduler tick, already offers HZ granularity time. The patch improves the sysbench OLTP macrobenchmark significantly: 2.6.22-rc6: #threads 1: transactions: 3733 (373.21 per sec.) 2: transactions: 6676 (667.46 per sec.) 3: transactions: 6957 (695.50 per sec.) 4: transactions: 7055 (705.48 per sec.) 5: transactions: 6596 (659.33 per sec.) 2.6.22-rc6 + sys_time.patch: 1: transactions: 4005 (400.47 per sec.) 2: transactions: 7379 (737.77 per sec.) 3: transactions: 7347 (734.49 per sec.) 4: transactions: 7468 (746.65 per sec.) 5: transactions: 7428 (742.47 per sec.) Mixed API uses of gettimeofday() and time() are guaranteed to be coherent via the use of a at-most-once-per-second slowpath that updates xtime. [akpm@linux-foundation.org: build fixes] Signed-off-by: Ingo Molnar Cc: John Stultz Cc: Thomas Gleixner Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index f04791f..ffe1914 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -57,14 +57,17 @@ EXPORT_SYMBOL(sys_tz); */ asmlinkage long sys_time(time_t __user * tloc) { - time_t i; - struct timeval tv; + /* + * We read xtime.tv_sec atomically - it's updated + * atomically by update_wall_time(), so no need to + * even read-lock the xtime seqlock: + */ + time_t i = xtime.tv_sec; - do_gettimeofday(&tv); - i = tv.tv_sec; + smp_rmb(); /* sys_time() results are coherent */ if (tloc) { - if (put_user(i,tloc)) + if (put_user(i, tloc)) i = -EFAULT; } return i; @@ -373,12 +376,25 @@ void do_gettimeofday (struct timeval *tv) tv->tv_sec = sec; tv->tv_usec = usec; -} + /* + * Make sure xtime.tv_sec [returned by sys_time()] always + * follows the gettimeofday() result precisely. 
This + * condition is extremely unlikely, it can hit at most + * once per second: + */ + if (unlikely(xtime.tv_sec != tv->tv_sec)) { + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + update_wall_time(); + write_sequnlock_irqrestore(&xtime_lock, flags); + } +} EXPORT_SYMBOL(do_gettimeofday); +#else /* CONFIG_TIME_INTERPOLATION */ -#else #ifndef CONFIG_GENERIC_TIME /* * Simulate gettimeofday using do_gettimeofday which only allows a timeval @@ -394,7 +410,7 @@ void getnstimeofday(struct timespec *tv) } EXPORT_SYMBOL_GPL(getnstimeofday); #endif -#endif +#endif /* CONFIG_TIME_INTERPOLATION */ /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 -- cgit v1.1 From 36cf3b5c3b7228bcf5124c530d50080b61a59f69 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Jul 2007 23:41:20 -0700 Subject: FUTEX: Tidy up the code The recent PRIVATE and REQUEUE_PI changes to the futex code made it hard to read. Tidy it up. Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 138 +++++++++++++++++++++++------------------------- kernel/rtmutex-debug.c | 6 --- kernel/rtmutex.c | 6 --- kernel/rtmutex_common.h | 9 +++- 4 files changed, 74 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 45490be..5c3f45d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -121,6 +121,24 @@ static struct futex_hash_bucket futex_queues[1<mmap_sem, when futex is shared + */ +static inline void futex_lock_mm(struct rw_semaphore *fshared) +{ + if (fshared) + down_read(fshared); +} + +/* + * Release mm->mmap_sem, when the futex is shared + */ +static inline void futex_unlock_mm(struct rw_semaphore *fshared) +{ + if (fshared) + up_read(fshared); +} + +/* * We hash on the keys returned from get_futex_key (see below). */ static struct futex_hash_bucket *hash_futex(union futex_key *key) @@ -287,7 +305,18 @@ void drop_futex_key_refs(union futex_key *key) } EXPORT_SYMBOL_GPL(drop_futex_key_refs); -static inline int get_futex_value_locked(u32 *dest, u32 __user *from) +static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) +{ + u32 curval; + + pagefault_disable(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + pagefault_enable(); + + return curval; +} + +static int get_futex_value_locked(u32 *dest, u32 __user *from) { int ret; @@ -620,9 +649,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) newval = FUTEX_WAITERS | new_owner->pid; - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - pagefault_enable(); + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); if (curval == -EFAULT) ret = -EFAULT; @@ -659,9 +686,7 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) * There is no waiter, so we unlock the futex. The owner died * bit has not to be preserved here. 
We are the owner: */ - pagefault_disable(); - oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); - pagefault_enable(); + oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); if (oldval == -EFAULT) return oldval; @@ -700,8 +725,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, union futex_key key; int ret; - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) @@ -725,8 +749,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, spin_unlock(&hb->lock); out: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; } @@ -746,8 +769,7 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, int ret, op_ret, attempt = 0; retryfull: - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) @@ -793,7 +815,7 @@ retry: */ if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr2, - fshared, attempt); + fshared, attempt); if (ret) goto out; goto retry; @@ -803,8 +825,7 @@ retry: * If we would have faulted, release mmap_sem, * fault it in and start all over again. */ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(dummy, uaddr2); if (ret) @@ -841,8 +862,8 @@ retry: if (hb1 != hb2) spin_unlock(&hb2->lock); out: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); + return ret; } @@ -861,8 +882,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, int ret, drop_count = 0; retry: - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) @@ -890,8 +910,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, * If we would have faulted, release mmap_sem, fault * it in and start all over again. */ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(curval, uaddr1); @@ -944,8 +963,7 @@ out_unlock: drop_futex_key_refs(&key1); out: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; } @@ -1113,10 +1131,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, while (!ret) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, - uval, newval); - pagefault_enable(); + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); if (curval == -EFAULT) ret = -EFAULT; @@ -1134,6 +1149,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, #define ARG3_SHARED 1 static long futex_wait_restart(struct restart_block *restart); + static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, u32 val, ktime_t *abs_time) { @@ -1148,8 +1164,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, q.pi_state = NULL; retry: - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) @@ -1186,8 +1201,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, * If we would have faulted, release mmap_sem, fault it in and * start all over again. */ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(uval, uaddr); @@ -1206,8 +1220,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. 
*/ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); /* * There might have been scheduling since the queue_me(), as we @@ -1285,8 +1298,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); out_release_sem: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; } @@ -1333,8 +1345,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, q.pi_state = NULL; retry: - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) @@ -1353,9 +1364,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, */ newval = current->pid; - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); - pagefault_enable(); + curval = cmpxchg_futex_value_locked(uaddr, 0, newval); if (unlikely(curval == -EFAULT)) goto uaddr_faulted; @@ -1398,9 +1407,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, lock_taken = 1; } - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - pagefault_enable(); + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); if (unlikely(curval == -EFAULT)) goto uaddr_faulted; @@ -1428,8 +1435,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * exit to complete. */ queue_unlock(&q, hb); - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); cond_resched(); goto retry; @@ -1465,8 +1471,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. */ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); WARN_ON(!q.pi_state); /* @@ -1480,8 +1485,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, ret = ret ? 0 : -EWOULDBLOCK; } - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); spin_lock(q.lock_ptr); if (!ret) { @@ -1518,8 +1522,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, /* Unqueue and drop the lock */ unqueue_me_pi(&q); - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret != -EINTR ? ret : -ERESTARTNOINTR; @@ -1527,8 +1530,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); out_release_sem: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; uaddr_faulted: @@ -1550,8 +1552,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, goto retry_unlocked; } - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) @@ -1585,8 +1586,7 @@ retry: /* * First take all the futex related locks: */ - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) @@ -1601,11 +1601,9 @@ retry_unlocked: * again. 
If it succeeds then we can return without waking * anyone else up: */ - if (!(uval & FUTEX_OWNER_DIED)) { - pagefault_disable(); - uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); - pagefault_enable(); - } + if (!(uval & FUTEX_OWNER_DIED)) + uval = cmpxchg_futex_value_locked(uaddr, current->pid, 0); + if (unlikely(uval == -EFAULT)) goto pi_faulted; @@ -1647,8 +1645,7 @@ retry_unlocked: out_unlock: spin_unlock(&hb->lock); out: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; @@ -1671,8 +1668,7 @@ pi_faulted: goto retry_unlocked; } - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) @@ -1729,8 +1725,8 @@ static int futex_fd(u32 __user *uaddr, int signal) if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " - "will be removed from the kernel in June 2007\n", - current->comm); + "will be removed from the kernel in June 2007\n", + current->comm); } ret = -EINVAL; @@ -1908,10 +1904,8 @@ retry: * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ - if (!pi) { - if (uval & FUTEX_WAITERS) + if (!pi && (uval & FUTEX_WAITERS)) futex_wake(uaddr, &curr->mm->mmap_sem, 1); - } } return 0; } diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index da8d6bf..5aedbee 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -29,12 +29,6 @@ #include "rtmutex_common.h" -#ifdef CONFIG_DEBUG_RT_MUTEXES -# include "rtmutex-debug.h" -#else -# include "rtmutex.h" -#endif - # define TRACE_WARN_ON(x) WARN_ON(x) # define TRACE_BUG_ON(x) BUG_ON(x) diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 17d28ce..8cd9bd2 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -17,12 +17,6 @@ #include "rtmutex_common.h" -#ifdef CONFIG_DEBUG_RT_MUTEXES -# include "rtmutex-debug.h" -#else -# include "rtmutex.h" -#endif - /* * lock->owner state tracking: * diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 9c75856..2d3b835 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -103,7 +103,7 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) { - return (struct task_struct *) + return (struct task_struct *) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); } @@ -120,4 +120,11 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + #endif -- cgit v1.1 From 2be7fe075af8f8ba9c8b8ab0feec4ba4fff04979 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 15 Jul 2007 23:41:21 -0700 Subject: sysctl.c: add text telling people to use CTL_UNNUMBERED Hopefully this will help people to understand the new regime. Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2cce228..7dca326 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -203,7 +203,10 @@ static ctl_table root_table[] = { .mode = 0555, .child = dev_table, }, - +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ { .ctl_name = 0 } }; @@ -992,6 +995,14 @@ static ctl_table vm_table[] = { .extra1 = &zero, }, #endif +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ { .ctl_name = 0 } }; @@ -1132,6 +1143,14 @@ static ctl_table fs_table[] = { .child = binfmt_misc_table, }, #endif +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ { .ctl_name = 0 } }; -- cgit v1.1 From 19769b762607fea53c005a0068cf8939cecd2f9a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 15 Jul 2007 23:41:24 -0700 Subject: sprint_symbol() cleanup Remove pointless `else'. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fed5441..0d66247 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -317,13 +317,12 @@ int sprint_symbol(char *buffer, unsigned long address) name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); if (!name) return sprintf(buffer, "0x%lx", address); - else { - if (modname) - return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, + + if (modname) + return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, size, modname); - else - return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); - } + else + return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); } /* Look up a kernel symbol and print it to the kernel messages. */ -- cgit v1.1 From 1d9d02feeee89e9132034d504c9a45eeaf618a3d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Sun, 15 Jul 2007 23:41:32 -0700 Subject: move seccomp from /proc to a prctl This reduces the memory footprint and it enforces that only the current task can enable seccomp on itself (this is a requirement for a strightforward [modulo preempt ;) ] TIF_NOTSC implementation). Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/seccomp.c | 26 ++++++++++++++++++++++++++ kernel/sys.c | 8 ++++++++ 2 files changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index c3391b6..1dfa8a5 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -10,6 +10,7 @@ #include /* #define SECCOMP_DEBUG 1 */ +#define NR_SECCOMP_MODES 1 /* * Secure computing mode 1 allows only read/write/exit/sigreturn. 
@@ -54,3 +55,28 @@ void __secure_computing(int this_syscall) #endif do_exit(SIGKILL); } + +long prctl_get_seccomp(void) +{ + return current->seccomp.mode; +} + +long prctl_set_seccomp(unsigned long seccomp_mode) +{ + long ret; + + /* can set it only once to be even more secure */ + ret = -EPERM; + if (unlikely(current->seccomp.mode)) + goto out; + + ret = -EINVAL; + if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { + current->seccomp.mode = seccomp_mode; + set_thread_flag(TIF_SECCOMP); + ret = 0; + } + + out: + return ret; +} diff --git a/kernel/sys.c b/kernel/sys.c index ed92e2f..4d141ae 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -2242,6 +2243,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = SET_ENDIAN(current, arg2); break; + case PR_GET_SECCOMP: + error = prctl_get_seccomp(); + break; + case PR_SET_SECCOMP: + error = prctl_set_seccomp(arg2); + break; + default: error = -EINVAL; break; -- cgit v1.1 From cf99abace7e07dd8491e7093a9a9ef11d48838ed Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Sun, 15 Jul 2007 23:41:33 -0700 Subject: make seccomp zerocost in schedule This follows a suggestion from Chuck Ebbert on how to make seccomp absolutely zerocost in schedule too. The only remaining footprint of seccomp is in terms of the bzImage size that becomes a few bytes (perhaps even a few kbytes) larger, measure it if you care in the embedded. Signed-off-by: Andrea Arcangeli Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/seccomp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1dfa8a5..ad64fcb 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -74,6 +74,9 @@ long prctl_set_seccomp(unsigned long seccomp_mode) if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { current->seccomp.mode = seccomp_mode; set_thread_flag(TIF_SECCOMP); +#ifdef TIF_NOTSC + disable_TSC(); +#endif ret = 0; } -- cgit v1.1 From f84d5a76c50d9752cdec64a6e536ee3901b267f6 Mon Sep 17 00:00:00 2001 From: vignesh babu Date: Sun, 15 Jul 2007 23:41:34 -0700 Subject: is_power_of_2: kernel/kfifo.c Replace (n & (n-1)) with is_power_of_2() Signed-off-by: vignesh babu Acked-by: Stelian Pop Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index cee4191..bc41ad0 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -24,6 +24,7 @@ #include #include #include +#include /** * kfifo_init - allocates a new FIFO using a preallocated buffer @@ -41,7 +42,7 @@ struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, struct kfifo *fifo; /* size must be a power of 2 */ - BUG_ON(size & (size - 1)); + BUG_ON(!is_power_of_2(size)); fifo = kmalloc(sizeof(struct kfifo), gfp_mask); if (!fifo) -- cgit v1.1 From f5a421a4509a7e2dff11da0f01b0548f4f84d503 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 15 Jul 2007 23:41:44 -0700 Subject: rename cancel_rearming_delayed_work() to cancel_delayed_work_sync() Imho, the current naming of cancel_xxx workqueue functions is very confusing. cancel_delayed_work() cancel_rearming_delayed_work() cancel_rearming_delayed_workqueue() // obsolete cancel_work_sync() This looks as if the first 2 functions differ in "type" of their argument which is not true any longer, nowadays the difference is the behaviour. 
The semantics of cancel_rearming_delayed_work(dwork) was changed significantly, it doesn't require that dwork rearms itself, and cancels dwork synchronously. Rename it to cancel_delayed_work_sync(). This matches cancel_delayed_work() and cancel_work_sync(). Re-create cancel_rearming_delayed_work() as a simple inline obsolete wrapper, like cancel_rearming_delayed_workqueue(). Signed-off-by: Oleg Nesterov Acked-by: Jarek Poplawski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3bebf73..ad96568 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -486,13 +486,13 @@ void cancel_work_sync(struct work_struct *work) EXPORT_SYMBOL_GPL(cancel_work_sync); /** - * cancel_rearming_delayed_work - reliably kill off a delayed work. + * cancel_delayed_work_sync - reliably kill off a delayed work. * @dwork: the delayed work struct * * It is possible to use this function if @dwork rearms itself via queue_work() * or queue_delayed_work(). See also the comment for cancel_work_sync(). */ -void cancel_rearming_delayed_work(struct delayed_work *dwork) +void cancel_delayed_work_sync(struct delayed_work *dwork) { while (!del_timer(&dwork->timer) && !try_to_grab_pending(&dwork->work)) @@ -500,7 +500,7 @@ void cancel_rearming_delayed_work(struct delayed_work *dwork) wait_on_work(&dwork->work); work_clear_pending(&dwork->work); } -EXPORT_SYMBOL(cancel_rearming_delayed_work); +EXPORT_SYMBOL(cancel_delayed_work_sync); static struct workqueue_struct *keventd_wq __read_mostly; -- cgit v1.1 From 1f1f642e2f092e37eb9038060eb0100c44f55a11 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 15 Jul 2007 23:41:44 -0700 Subject: make cancel_xxx_work_sync() return a boolean Change cancel_work_sync() and cancel_delayed_work_sync() to return a boolean indicating whether the work was actually cancelled. A zero return value means that the work was not pending/queued. Without that kind of change it is not possible to avoid flush_workqueue() sometimes, see the next patch as an example. Also, this patch unifies both functions and kills the (unlikely) busy-wait loop. Signed-off-by: Oleg Nesterov Acked-by: Jarek Poplawski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ad96568..d7d3fa3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -382,16 +382,16 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) EXPORT_SYMBOL_GPL(flush_workqueue); /* - * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit, + * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, * so this work can't be re-armed in any way. */ static int try_to_grab_pending(struct work_struct *work) { struct cpu_workqueue_struct *cwq; - int ret = 0; + int ret = -1; if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) - return 1; + return 0; /* * The queueing is in progress, or it is already queued. 
Try to @@ -457,10 +457,28 @@ static void wait_on_work(struct work_struct *work) wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } +static int __cancel_work_timer(struct work_struct *work, + struct timer_list* timer) +{ + int ret; + + do { + ret = (timer && likely(del_timer(timer))); + if (!ret) + ret = try_to_grab_pending(work); + wait_on_work(work); + } while (unlikely(ret < 0)); + + work_clear_pending(work); + return ret; +} + /** * cancel_work_sync - block until a work_struct's callback has terminated * @work: the work which is to be flushed * + * Returns true if @work was pending. + * * cancel_work_sync() will cancel the work if it is queued. If the work's * callback appears to be running, cancel_work_sync() will block until it * has completed. @@ -476,12 +494,9 @@ static void wait_on_work(struct work_struct *work) * The caller must ensure that workqueue_struct on which this work was last * queued can't be destroyed before this function returns. */ -void cancel_work_sync(struct work_struct *work) +int cancel_work_sync(struct work_struct *work) { - while (!try_to_grab_pending(work)) - cpu_relax(); - wait_on_work(work); - work_clear_pending(work); + return __cancel_work_timer(work, NULL); } EXPORT_SYMBOL_GPL(cancel_work_sync); @@ -489,16 +504,14 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); * cancel_delayed_work_sync - reliably kill off a delayed work. * @dwork: the delayed work struct * + * Returns true if @dwork was pending. + * * It is possible to use this function if @dwork rearms itself via queue_work() * or queue_delayed_work(). See also the comment for cancel_work_sync(). */ -void cancel_delayed_work_sync(struct delayed_work *dwork) +int cancel_delayed_work_sync(struct delayed_work *dwork) { - while (!del_timer(&dwork->timer) && - !try_to_grab_pending(&dwork->work)) - cpu_relax(); - wait_on_work(&dwork->work); - work_clear_pending(&dwork->work); + return __cancel_work_timer(&dwork->work, &dwork->timer); } EXPORT_SYMBOL(cancel_delayed_work_sync); -- cgit v1.1 From 24da1cbff9cfce50868c2dfdcda82a68ac5cb707 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sun, 15 Jul 2007 23:41:46 -0700 Subject: modules: remove modlist_lock Now we always use stop_machine for module insertion or deletion, we no longer need the modlist_lock: merely disabling preemption is sufficient to block against list manipulation. This avoids deadlock on OOPSen where we can potentially grab the lock twice. Bug: 8695 Signed-off-by: Rusty Russell Cc: Ingo Molnar Cc: Tobias Oed Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 7a1a4d3..539fed9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -61,10 +61,8 @@ extern int module_sysfs_initialized; /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) -/* Protects module list */ -static DEFINE_SPINLOCK(modlist_lock); - -/* List of modules, protected by module_mutex AND modlist_lock */ +/* List of modules, protected by module_mutex or preempt_disable + * (add/delete uses stop_machine). 
*/ static DEFINE_MUTEX(module_mutex); static LIST_HEAD(modules); @@ -760,14 +758,13 @@ static void print_unload_info(struct seq_file *m, struct module *mod) void __symbol_put(const char *symbol) { struct module *owner; - unsigned long flags; const unsigned long *crc; - spin_lock_irqsave(&modlist_lock, flags); + preempt_disable(); if (!__find_symbol(symbol, &owner, &crc, 1)) BUG(); module_put(owner); - spin_unlock_irqrestore(&modlist_lock, flags); + preempt_enable(); } EXPORT_SYMBOL(__symbol_put); @@ -1228,14 +1225,14 @@ static void free_module(struct module *mod) void *__symbol_get(const char *symbol) { struct module *owner; - unsigned long value, flags; + unsigned long value; const unsigned long *crc; - spin_lock_irqsave(&modlist_lock, flags); + preempt_disable(); value = __find_symbol(symbol, &owner, &crc, 1); if (value && !strong_try_module_get(owner)) value = 0; - spin_unlock_irqrestore(&modlist_lock, flags); + preempt_enable(); return (void *)value; } @@ -2308,11 +2305,10 @@ const struct seq_operations modules_op = { /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_extables(unsigned long addr) { - unsigned long flags; const struct exception_table_entry *e = NULL; struct module *mod; - spin_lock_irqsave(&modlist_lock, flags); + preempt_disable(); list_for_each_entry(mod, &modules, list) { if (mod->num_exentries == 0) continue; @@ -2323,7 +2319,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) if (e) break; } - spin_unlock_irqrestore(&modlist_lock, flags); + preempt_enable(); /* Now, if we found one, we are running inside it now, hence we cannot unload the module, hence no refcnt needed. */ @@ -2335,25 +2331,24 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) */ int is_module_address(unsigned long addr) { - unsigned long flags; struct module *mod; - spin_lock_irqsave(&modlist_lock, flags); + preempt_disable(); list_for_each_entry(mod, &modules, list) { if (within(addr, mod->module_core, mod->core_size)) { - spin_unlock_irqrestore(&modlist_lock, flags); + preempt_enable(); return 1; } } - spin_unlock_irqrestore(&modlist_lock, flags); + preempt_enable(); return 0; } -/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ +/* Is this a valid kernel address? */ struct module *__module_text_address(unsigned long addr) { struct module *mod; @@ -2368,11 +2363,10 @@ struct module *__module_text_address(unsigned long addr) struct module *module_text_address(unsigned long addr) { struct module *mod; - unsigned long flags; - spin_lock_irqsave(&modlist_lock, flags); + preempt_disable(); mod = __module_text_address(addr); - spin_unlock_irqrestore(&modlist_lock, flags); + preempt_enable(); return mod; } -- cgit v1.1 From 1492192b4a0bb84dd9b792cc0bd30583220a28a7 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Sun, 15 Jul 2007 23:41:51 -0700 Subject: kernel/printk.c: document possible deadlock against scheduler kernel/printk.c: document possible deadlock against scheduler The printk's comment states that it can be called from every context, which might lead to false illusion that it could be called from everywhere without any restrictions. This is however not true - a call to printk() could deadlock if called from scheduler code (namely from schedule(), wake_up(), etc) on runqueue lock when it tries to wake up klogd. Document this. 
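A purely hypothetical sketch of the pattern being warned about (this code does not exist anywhere in the tree, it only illustrates the deadlock):

  #include <linux/kernel.h>
  #include <linux/sched.h>

  /* Imaginary debug hack inside scheduler code; the runqueue lock is
   * already held when this runs. */
  static void debug_trace_wakeup(struct task_struct *p)
  {
          /* printk() may try to wake up klogd; that wake_up() needs
           * the very runqueue lock the caller is holding, so this can
           * deadlock. */
          printk(KERN_DEBUG "waking up pid %d\n", p->pid);
  }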
Signed-off-by: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index fccacf7..051d27e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -486,6 +486,9 @@ static int have_callable_console(void) * @fmt: format string * * This is printk(). It can be called from any context. We want it to work. + * Be aware of the fact that if oops_in_progress is not set, we might try to + * wake klogd up which could deadlock on runqueue lock if printk() is called + * from scheduler code. * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output -- cgit v1.1 From 7144521f5ac741e9ad3033953b9d9fdede015ee0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 16 Jul 2007 11:50:38 -0700 Subject: Remove duplicate comments from sysctl.c Randy Dunlap noticed that the recent comment clarifications from Andrew had somehow gotten duplicated. Quoth Andrew: "hm, that could have been some late-night reject-fixing." Fix it up. Cc: From: Andrew Morton Cc: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7dca326..2ce7acf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -999,10 +999,6 @@ static ctl_table vm_table[] = { * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { .ctl_name = 0 } }; @@ -1147,10 +1143,6 @@ static ctl_table fs_table[] = { * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { .ctl_name = 0 } }; -- cgit v1.1 From 7713a7d195c2e11d7ba6a973317da0af862d1264 Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 16 Jul 2007 17:17:44 -0700 Subject: [HRTIMER] Fix cpu pointer arg to clockevents_notify() All of the clockevent notifiers expect a pointer to an "unsigned int" cpu argument, but hrtimer_cpu_notify() passes in a pointer to a long. [ Discussed with and ok by Thomas Gleixner ] Signed-off-by: David S. Miller Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 23c03f4..72d0342 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1406,7 +1406,7 @@ static void migrate_hrtimers(int cpu) static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - long cpu = (long)hcpu; + unsigned int cpu = (long)hcpu; switch (action) { -- cgit v1.1 From 396faf0303d273219db5d7eb4a2879ad977ed185 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 17 Jul 2007 04:03:13 -0700 Subject: Allow huge page allocations to use GFP_HIGH_MOVABLE Huge pages are not movable so are not allocated from ZONE_MOVABLE. However, as ZONE_MOVABLE will always have pages that can be migrated or reclaimed, it can be used to satisfy hugepage allocations even when the system has been running a long time. This allows an administrator to resize the hugepage pool at runtime depending on the size of ZONE_MOVABLE. 
This patch adds a new sysctl called hugepages_treat_as_movable. When a non-zero value is written to it, future allocations for the huge page pool will use ZONE_MOVABLE. Despite huge pages being non-movable, we do not introduce additional external fragmentation of note as huge pages are always the largest contiguous block we care about. [akpm@linux-foundation.org: various fixes] Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2ce7acf..48dae07 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -826,6 +826,14 @@ static ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hugepages_treat_as_movable", + .data = &hugepages_treat_as_movable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &hugetlb_treat_movable_handler, + }, #endif { .ctl_name = VM_LOWMEM_RESERVE_RATIO, -- cgit v1.1 From 94f6030ca792c57422f04a73e7a872d8325946d3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 17 Jul 2007 04:03:29 -0700 Subject: Slab allocators: Replace explicit zeroing with __GFP_ZERO kmalloc_node() and kmem_cache_alloc_node() were not available in a zeroing variant in the past. But with __GFP_ZERO it is possible now to do zeroing while allocating. Use __GFP_ZERO to remove the explicit clearing of memory via memset whereever we can. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 1258371..b7792fb 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1221,7 +1221,8 @@ static int __devinit init_timers_cpu(int cpu) /* * The APs use this path later in boot */ - base = kmalloc_node(sizeof(*base), GFP_KERNEL, + base = kmalloc_node(sizeof(*base), + GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); if (!base) return -ENOMEM; @@ -1232,7 +1233,6 @@ static int __devinit init_timers_cpu(int cpu) kfree(base); return -ENOMEM; } - memset(base, 0, sizeof(*base)); per_cpu(tvec_bases, cpu) = base; } else { /* -- cgit v1.1 From 831441862956fffa17b9801db37e6ea1650b0f69 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 17 Jul 2007 04:03:35 -0700 Subject: Freezer: make kernel threads nonfreezable by default Currently, the freezer treats all tasks as freezable, except for the kernel threads that explicitly set the PF_NOFREEZE flag for themselves. This approach is problematic, since it requires every kernel thread to either set PF_NOFREEZE explicitly, or call try_to_freeze(), even if it doesn't care for the freezing of tasks at all. It seems better to only require the kernel threads that want to or need to be frozen to use some freezer-related code and to remove any freezer-related code from the other (nonfreezable) kernel threads, which is done in this patch. The patch causes all kernel threads to be nonfreezable by default (ie. to have PF_NOFREEZE set by default) and introduces the set_freezable() function that should be called by the freezable kernel threads in order to unset PF_NOFREEZE. It also makes all of the currently freezable kernel threads call set_freezable(), so it shouldn't cause any (intentional) change of behaviour to appear. Additionally, it updates documentation to describe the freezing of tasks more accurately. 
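As a sketch of the new convention (the thread itself is hypothetical, not part of this patch), a kernel thread that wants to be frozen now opts in explicitly instead of every other thread having to clear PF_NOFREEZE:

  #include <linux/freezer.h>
  #include <linux/kthread.h>
  #include <linux/sched.h>

  /* Hypothetical worker: freezable only because it asks to be. */
  static int frob_thread(void *unused)
  {
          set_freezable();        /* clears PF_NOFREEZE for this thread */
          while (!kthread_should_stop()) {
                  try_to_freeze();
                  /* ... periodic work ... */
                  schedule_timeout_interruptible(HZ);
          }
          return 0;
  }

Threads that never call set_freezable() simply keep running across a suspend or hibernation attempt, which is the intended default.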
[akpm@linux-foundation.org: build fixes] Signed-off-by: Rafael J. Wysocki Acked-by: Nigel Cunningham Cc: Pavel Machek Cc: Oleg Nesterov Cc: Gautham R Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 1 + kernel/exit.c | 6 ++++++ kernel/fork.c | 2 +- kernel/rcutorture.c | 4 +--- kernel/rtmutex-tester.c | 1 + kernel/sched.c | 3 --- kernel/softirq.c | 3 +-- kernel/softlockup.c | 2 +- kernel/workqueue.c | 4 ++-- 9 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 5ce8851..eb0f916 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -392,6 +392,7 @@ static int kauditd_thread(void *dummy) { struct sk_buff *skb; + set_freezable(); while (!kthread_should_stop()) { skb = skb_dequeue(&audit_skb_queue); wake_up(&audit_backlog_wait); diff --git a/kernel/exit.c b/kernel/exit.c index 5762669..e8af8d0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -387,6 +388,11 @@ void daemonize(const char *name, ...) * they would be locked into memory. */ exit_mm(current); + /* + * We don't want to have TIF_FREEZE set if the system-wide hibernation + * or suspend transition begins right now. + */ + current->flags |= PF_NOFREEZE; set_special_pids(1, 1); proc_clear_tty(current); diff --git a/kernel/fork.c b/kernel/fork.c index 7c5c588..ba39bdb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -923,7 +923,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); + new_flags &= ~PF_SUPERPRIV; new_flags |= PF_FORKNOEXEC; if (!(clone_flags & CLONE_PTRACE)) p->ptrace = 0; diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 55ba82a..ddff332 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -518,7 +519,6 @@ rcu_torture_writer(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); set_user_nice(current, 19); - current->flags |= PF_NOFREEZE; do { schedule_timeout_uninterruptible(1); @@ -558,7 +558,6 @@ rcu_torture_fakewriter(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); set_user_nice(current, 19); - current->flags |= PF_NOFREEZE; do { schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); @@ -589,7 +588,6 @@ rcu_torture_reader(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); - current->flags |= PF_NOFREEZE; do { idx = cur_ops->readlock(); diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 015fc63..e3055ba 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -260,6 +260,7 @@ static int test_func(void *data) int ret; current->flags |= PF_MUTEX_TESTER; + set_freezable(); allow_signal(SIGHUP); for(;;) { diff --git a/kernel/sched.c b/kernel/sched.c index 1c80766..cb31fb4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4912,8 +4912,6 @@ static int migration_thread(void *data) struct migration_req *req; struct list_head *head; - try_to_freeze(); - spin_lock_irq(&rq->lock); if (cpu_is_offline(cpu)) { @@ -5147,7 +5145,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); if (IS_ERR(p)) return NOTIFY_BAD; - p->flags |= PF_NOFREEZE; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. 
*/ rq = task_rq_lock(p, &flags); diff --git a/kernel/softirq.c b/kernel/softirq.c index 8de2677..0f546dd 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -488,8 +489,6 @@ void __init softirq_init(void) static int ksoftirqd(void * __bind_cpu) { - current->flags |= PF_NOFREEZE; - set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 0131e29..708d488 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -116,7 +117,6 @@ static int watchdog(void * __bind_cpu) struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; sched_setscheduler(current, SCHED_FIFO, ¶m); - current->flags |= PF_NOFREEZE; /* initialize timestamp */ touch_softlockup_watchdog(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d7d3fa3..1935302 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -282,8 +282,8 @@ static int worker_thread(void *__cwq) struct cpu_workqueue_struct *cwq = __cwq; DEFINE_WAIT(wait); - if (!cwq->wq->freezeable) - current->flags |= PF_NOFREEZE; + if (cwq->wq->freezeable) + set_freezable(); set_user_nice(current, -5); -- cgit v1.1 From bcdcd8e725b923ad7c0de809680d5d5658a7bf8c Mon Sep 17 00:00:00 2001 From: Pavel Emelianov Date: Tue, 17 Jul 2007 04:03:42 -0700 Subject: Report that kernel is tainted if there was an OOPS If the kernel OOPSed or BUGed then it probably should be considered as tainted. Thus, all subsequent OOPSes and SysRq dumps will report the tainted kernel. This saves a lot of time explaining oddities in the calltraces. Signed-off-by: Pavel Emelianov Acked-by: Randy Dunlap Cc: Signed-off-by: Andrew Morton [ Added parisc patch from Matthew Wilson -Linus ] Signed-off-by: Linus Torvalds --- kernel/panic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 623d182..f64f4c1 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -159,14 +159,15 @@ const char *print_tainted(void) { static char buf[20]; if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c", + snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c", tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', tainted & TAINT_FORCED_MODULE ? 'F' : ' ', tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', tainted & TAINT_BAD_PAGE ? 'B' : ' ', - tainted & TAINT_USER ? 'U' : ' '); + tainted & TAINT_USER ? 'U' : ' ', + tainted & TAINT_DIE ? 'D' : ' '); } else snprintf(buf, sizeof(buf), "Not tainted"); -- cgit v1.1 From 7664732315c97f48dba9d1e7339ad16fc5a320ac Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 17 Jul 2007 04:03:43 -0700 Subject: PTRACE_PEEKDATA consolidation Identical implementations of PTRACE_PEEKDATA go into generic_ptrace_peekdata() function. 
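A sketch of the intended call site (the arch_ptrace() body shown is hypothetical, the exact switch varies per architecture, and it assumes the helper's prototype is exported via linux/ptrace.h as in the full patch):

  #include <linux/ptrace.h>
  #include <linux/sched.h>

  /* Hypothetical per-arch dispatcher delegating to the common helper. */
  long arch_ptrace(struct task_struct *child, long request,
                   long addr, long data)
  {
          long ret;

          switch (request) {
          case PTRACE_PEEKTEXT:   /* read a word at location addr */
          case PTRACE_PEEKDATA:
                  ret = generic_ptrace_peekdata(child, addr, data);
                  break;
          default:
                  ret = ptrace_request(child, request, addr, data);
                  break;
          }
          return ret;
  }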
Signed-off-by: Alexey Dobriyan Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b1d11f1..1653d35 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -490,3 +490,14 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) return ret; } #endif /* __ARCH_SYS_PTRACE */ + +int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) +{ + unsigned long tmp; + int copied; + + copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); + if (copied != sizeof(tmp)) + return -EIO; + return put_user(tmp, (unsigned long __user *)data); +} -- cgit v1.1 From f284ce7269031947326bac6bb19a977705276222 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 17 Jul 2007 04:03:44 -0700 Subject: PTRACE_POKEDATA consolidation Identical implementations of PTRACE_POKEDATA go into generic_ptrace_pokedata() function. AFAICS, fix bug on xtensa where successful PTRACE_POKEDATA will nevertheless return EPERM. Signed-off-by: Alexey Dobriyan Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1653d35..4a1745f1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -501,3 +501,11 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) return -EIO; return put_user(tmp, (unsigned long __user *)data); } + +int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) +{ + int copied; + + copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); + return (copied == sizeof(data)) ? 0 : -EIO; +} -- cgit v1.1 From 62239ac2b301abc397e70986649666cfb7835907 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 17 Jul 2007 04:03:45 -0700 Subject: proper prototype for proc_nr_files() Add a proper prototype for proc_nr_files() in include/linux/fs.h Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 48dae07..7063ebc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -49,9 +50,6 @@ #include #include -extern int proc_nr_files(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos); - #ifdef CONFIG_X86 #include #include -- cgit v1.1 From 9281acea6a3687ff0f262e0be31eac34895b95d7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2007 04:03:51 -0700 Subject: kallsyms: make KSYM_NAME_LEN include space for trailing '\0' KSYM_NAME_LEN is peculiar in that it does not include the space for the trailing '\0', forcing all users to use KSYM_NAME_LEN + 1 when allocating buffer. This is nonsense and error-prone. Moreover, when the caller forgets that it's very likely to subtly bite back by corrupting the stack because the last position of the buffer is always cleared to zero. This patch increments KSYM_NAME_LEN by one and updates code accordingly. * off-by-one bug in asm-powerpc/kprobes.h::kprobe_lookup_name() macro is fixed. * Where MODULE_NAME_LEN and KSYM_NAME_LEN were used together, MODULE_NAME_LEN was treated as if it didn't include space for the trailing '\0'. Fix it. 
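The effect on callers, as a sketch (the printing helper is hypothetical): KSYM_NAME_LEN now already includes room for the trailing '\0', so the old "+ 1" disappears from buffer declarations.

  #include <linux/kallsyms.h>
  #include <linux/kernel.h>

  /* Hypothetical caller illustrating the new sizing rule. */
  static void print_symbol_at(unsigned long addr)
  {
          char namebuf[KSYM_NAME_LEN];    /* was KSYM_NAME_LEN + 1 */
          unsigned long size, offset;
          char *modname;

          if (kallsyms_lookup(addr, &size, &offset, &modname, namebuf))
                  printk(KERN_INFO "%s\n", namebuf);
  }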
Signed-off-by: Tejun Heo Acked-by: Paulo Marques Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 16 ++++++++-------- kernel/lockdep.c | 4 ++-- kernel/module.c | 10 +++++----- kernel/time/timer_list.c | 2 +- kernel/time/timer_stats.c | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 0d66247..474219a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -152,7 +152,7 @@ static unsigned int get_symbol_offset(unsigned long pos) /* Lookup the address for this symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name) { - char namebuf[KSYM_NAME_LEN+1]; + char namebuf[KSYM_NAME_LEN]; unsigned long i; unsigned int off; @@ -248,7 +248,7 @@ const char *kallsyms_lookup(unsigned long addr, { const char *msym; - namebuf[KSYM_NAME_LEN] = 0; + namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; if (is_ksym_addr(addr)) { @@ -265,7 +265,7 @@ const char *kallsyms_lookup(unsigned long addr, /* see if it's in a module */ msym = module_address_lookup(addr, symbolsize, offset, modname); if (msym) - return strncpy(namebuf, msym, KSYM_NAME_LEN); + return strncpy(namebuf, msym, KSYM_NAME_LEN - 1); return NULL; } @@ -273,7 +273,7 @@ const char *kallsyms_lookup(unsigned long addr, int lookup_symbol_name(unsigned long addr, char *symname) { symname[0] = '\0'; - symname[KSYM_NAME_LEN] = '\0'; + symname[KSYM_NAME_LEN - 1] = '\0'; if (is_ksym_addr(addr)) { unsigned long pos; @@ -291,7 +291,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name) { name[0] = '\0'; - name[KSYM_NAME_LEN] = '\0'; + name[KSYM_NAME_LEN - 1] = '\0'; if (is_ksym_addr(addr)) { unsigned long pos; @@ -312,7 +312,7 @@ int sprint_symbol(char *buffer, unsigned long address) char *modname; const char *name; unsigned long offset, size; - char namebuf[KSYM_NAME_LEN+1]; + char namebuf[KSYM_NAME_LEN]; name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); if (!name) @@ -342,8 +342,8 @@ struct kallsym_iter unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols */ char type; - char name[KSYM_NAME_LEN+1]; - char module_name[MODULE_NAME_LEN + 1]; + char name[KSYM_NAME_LEN]; + char module_name[MODULE_NAME_LEN]; int exported; }; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 1a5ff22..edba2ff 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -379,7 +379,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4 static void print_lock_name(struct lock_class *class) { - char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4; + char str[KSYM_NAME_LEN], c1, c2, c3, c4; const char *name; get_usage_chars(class, &c1, &c2, &c3, &c4); @@ -401,7 +401,7 @@ static void print_lock_name(struct lock_class *class) static void print_lockdep_cache(struct lockdep_map *lock) { const char *name; - char str[KSYM_NAME_LEN + 1]; + char str[KSYM_NAME_LEN]; name = lock->name; if (!name) diff --git a/kernel/module.c b/kernel/module.c index 539fed9..33c04ad 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2133,7 +2133,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) sym = get_ksymbol(mod, addr, NULL, NULL); if (!sym) goto out; - strlcpy(symname, sym, KSYM_NAME_LEN + 1); + strlcpy(symname, sym, KSYM_NAME_LEN); mutex_unlock(&module_mutex); return 0; } @@ -2158,9 +2158,9 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long 
*size, if (!sym) goto out; if (modname) - strlcpy(modname, mod->name, MODULE_NAME_LEN + 1); + strlcpy(modname, mod->name, MODULE_NAME_LEN); if (name) - strlcpy(name, sym, KSYM_NAME_LEN + 1); + strlcpy(name, sym, KSYM_NAME_LEN); mutex_unlock(&module_mutex); return 0; } @@ -2181,8 +2181,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - KSYM_NAME_LEN + 1); - strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1); + KSYM_NAME_LEN); + strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, mod); mutex_unlock(&module_mutex); return 0; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 8bbcfb7..e5edc3a 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -38,7 +38,7 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); static void print_name_offset(struct seq_file *m, void *sym) { - char symname[KSYM_NAME_LEN+1]; + char symname[KSYM_NAME_LEN]; if (lookup_symbol_name((unsigned long)sym, symname) < 0) SEQ_printf(m, "<%p>", sym); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 9b8a826..8ed62fd 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -269,7 +269,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, static void print_name_offset(struct seq_file *m, unsigned long addr) { - char symname[KSYM_NAME_LEN+1]; + char symname[KSYM_NAME_LEN]; if (lookup_symbol_name(addr, symname) < 0) seq_printf(m, "<%p>", (void *)addr); -- cgit v1.1 From 13c22168b7276dffe49dc66675d5a78f6d288e0d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 17 Jul 2007 04:03:55 -0700 Subject: destroy_workqueue() can livelock Pointed out by Michal Schmidt . The bug was introduced in 2.6.22 by me. cleanup_workqueue_thread() does flush_cpu_workqueue(cwq) in a loop until ->worklist becomes empty. This is live-lockable, a re-niced caller can get CPU after wake_up() and insert a new barrier before the lower-priority cwq->thread has a chance to clear ->current_work. Change cleanup_workqueue_thread() to do flush_cpu_workqueue(cwq) only once. We can rely on the fact that run_workqueue() won't return until it flushes all works. So it is safe to call kthread_stop() after that, the "should stop" request won't be noticed until run_workqueue() returns. Signed-off-by: Oleg Nesterov Cc: Michal Schmidt Cc: Srivatsa Vaddagiri Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1935302..58e5c15 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -752,18 +752,17 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) if (cwq->thread == NULL) return; + flush_cpu_workqueue(cwq); /* - * If the caller is CPU_DEAD the single flush_cpu_workqueue() - * is not enough, a concurrent flush_workqueue() can insert a - * barrier after us. + * If the caller is CPU_DEAD and cwq->worklist was not empty, + * a concurrent flush_workqueue() can insert a barrier after us. + * However, in that case run_workqueue() won't return and check + * kthread_should_stop() until it flushes all work_struct's. 
* When ->worklist becomes empty it is safe to exit because no * more work_structs can be queued on this cwq: flush_workqueue * checks list_empty(), and a "normal" queue_work() can't use * a dead CPU. */ - while (flush_cpu_workqueue(cwq)) - ; - kthread_stop(cwq->thread); cwq->thread = NULL; } -- cgit v1.1 From 6f686d3d14621b90f3793b705bdf9fa624fd29ca Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Mon, 16 Jul 2007 21:25:01 -0400 Subject: kernel/auditfilter: kill bogus uninit'd-var compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kill this warning... kernel/auditfilter.c: In function ‘audit_receive_filter’: kernel/auditfilter.c:1213: warning: ‘ndw’ may be used uninitialized in this function kernel/auditfilter.c:1213: warning: ‘ndp’ may be used uninitialized in this function ...with a simplification of the code. audit_put_nd() can accept NULL arguments, just like kfree(). It is cleaner to init two existing vars to NULL, remove the redundant test variable 'putnd_needed' branches, and call audit_put_nd() directly. As a desired side effect, the warning goes away. Signed-off-by: Jeff Garzik --- kernel/auditfilter.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index ce61f42..1bf093d 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1210,8 +1210,8 @@ static inline int audit_add_rule(struct audit_entry *entry, struct audit_entry *e; struct audit_field *inode_f = entry->rule.inode_f; struct audit_watch *watch = entry->rule.watch; - struct nameidata *ndp, *ndw; - int h, err, putnd_needed = 0; + struct nameidata *ndp = NULL, *ndw = NULL; + int h, err; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -1239,7 +1239,6 @@ static inline int audit_add_rule(struct audit_entry *entry, err = audit_get_nd(watch->path, &ndp, &ndw); if (err) goto error; - putnd_needed = 1; } mutex_lock(&audit_filter_mutex); @@ -1269,14 +1268,11 @@ static inline int audit_add_rule(struct audit_entry *entry, #endif mutex_unlock(&audit_filter_mutex); - if (putnd_needed) - audit_put_nd(ndp, ndw); - + audit_put_nd(ndp, ndw); /* NULL args OK */ return 0; error: - if (putnd_needed) - audit_put_nd(ndp, ndw); + audit_put_nd(ndp, ndw); /* NULL args OK */ if (watch) audit_put_watch(watch); /* tmp watch, matches initial get */ return err; -- cgit v1.1 From 0ab4dc92278a0f3816e486d6350c6652a72e06c8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:02 -0700 Subject: usermodehelper: split setup from execution Rather than having hundreds of variations of call_usermodehelper for various pieces of usermode state which could be set up, split the info allocation and initialization from the actual process execution. This means the general pattern becomes: info = call_usermodehelper_setup(path, argv, envp); /* basic state */ call_usermodehelper_(info, stuff...); /* extra state */ call_usermodehelper_exec(info, wait); /* run process and free info */ This patch introduces wrappers for all the existing calling styles for call_usermodehelper_*, but folds their implementations into one. 
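Spelled out as a sketch (the helper path, its arguments and the function name are made up), the pattern reads:

  #include <linux/errno.h>
  #include <linux/kmod.h>

  /* Hypothetical caller of the split setup/exec API. */
  static int run_frob_helper(void)
  {
          struct subprocess_info *info;
          char *argv[] = { "/sbin/frob-helper", "--quiet", NULL };
          char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

          info = call_usermodehelper_setup(argv[0], argv, envp);
          if (info == NULL)
                  return -ENOMEM;

          /* optional extra state goes here, e.g.
           * call_usermodehelper_setkeys(info, keyring) or
           * call_usermodehelper_setcleanup(info, my_cleanup) */

          /* runs the helper and frees info; 1 means wait for the
           * process to finish */
          return call_usermodehelper_exec(info, 1);
  }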
Signed-off-by: Jeremy Fitzhardinge Cc: Andi Kleen Cc: Rusty Russell Cc: David Howells Cc: Bj?rn Steinbrink Cc: Randy Dunlap --- kernel/kmod.c | 191 +++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 135 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 4d32eb0..d2dce71 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -122,6 +122,7 @@ struct subprocess_info { int wait; int retval; struct file *stdin; + void (*cleanup)(char **argv, char **envp); }; /* @@ -180,6 +181,14 @@ static int ____call_usermodehelper(void *data) do_exit(0); } +void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info->argv, info->envp); + kfree(info); +} +EXPORT_SYMBOL(call_usermodehelper_freeinfo); + /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -217,7 +226,7 @@ static int wait_for_helper(void *data) } if (sub_info->wait < 0) - kfree(sub_info); + call_usermodehelper_freeinfo(sub_info); else complete(sub_info->complete); return 0; @@ -252,11 +261,94 @@ static void __call_usermodehelper(struct work_struct *work) } /** - * call_usermodehelper_keys - start a usermode application - * @path: pathname for the application - * @argv: null-terminated argument list - * @envp: null-terminated environment list - * @session_keyring: session keyring for process (NULL for an empty keyring) + * call_usermodehelper_setup - prepare to call a usermode helper + * @path - path to usermode executable + * @argv - arg vector for process + * @envp - environment for process + * + * Returns either NULL on allocation failure, or a subprocess_info + * structure. This should be passed to call_usermodehelper_exec to + * exec the process and free the structure. + */ +struct subprocess_info *call_usermodehelper_setup(char *path, + char **argv, char **envp) +{ + struct subprocess_info *sub_info; + sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); + if (!sub_info) + goto out; + + INIT_WORK(&sub_info->work, __call_usermodehelper); + sub_info->path = path; + sub_info->argv = argv; + sub_info->envp = envp; + + out: + return sub_info; +} +EXPORT_SYMBOL(call_usermodehelper_setup); + +/** + * call_usermodehelper_setkeys - set the session keys for usermode helper + * @info: a subprocess_info returned by call_usermodehelper_setup + * @session_keyring: the session keyring for the process + */ +void call_usermodehelper_setkeys(struct subprocess_info *info, + struct key *session_keyring) +{ + info->ring = session_keyring; +} +EXPORT_SYMBOL(call_usermodehelper_setkeys); + +/** + * call_usermodehelper_setcleanup - set a cleanup function + * @info: a subprocess_info returned by call_usermodehelper_setup + * @cleanup: a cleanup function + * + * The cleanup function is just befor ethe subprocess_info is about to + * be freed. This can be used for freeing the argv and envp. The + * Function must be runnable in either a process context or the + * context in which call_usermodehelper_exec is called. 
+ */ +void call_usermodehelper_setcleanup(struct subprocess_info *info, + void (*cleanup)(char **argv, char **envp)) +{ + info->cleanup = cleanup; +} +EXPORT_SYMBOL(call_usermodehelper_setcleanup); + +/** + * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin + * @sub_info: a subprocess_info returned by call_usermodehelper_setup + * @filp: set to the write-end of a pipe + * + * This constructs a pipe, and sets the read end to be the stdin of the + * subprocess, and returns the write-end in *@filp. + */ +int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, + struct file **filp) +{ + struct file *f; + + f = create_write_pipe(); + if (IS_ERR(f)) + return PTR_ERR(f); + *filp = f; + + f = create_read_pipe(f); + if (IS_ERR(f)) { + free_write_pipe(*filp); + return PTR_ERR(f); + } + sub_info->stdin = f; + + return 0; +} +EXPORT_SYMBOL(call_usermodehelper_stdinpipe); + +/** + * call_usermodehelper_exec - start a usermode application + * @sub_info: information about the subprocessa * @wait: wait for the application to finish and return status. * when -1 don't wait at all, but you get no useful error back when * the program couldn't be exec'ed. This makes it safe to call @@ -265,33 +357,24 @@ static void __call_usermodehelper(struct work_struct *work) * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). - * - * Must be called from process context. Returns a negative error code - * if program was not execed successfully, or 0. */ -int call_usermodehelper_keys(char *path, char **argv, char **envp, - struct key *session_keyring, int wait) +int call_usermodehelper_exec(struct subprocess_info *sub_info, + int wait) { DECLARE_COMPLETION_ONSTACK(done); - struct subprocess_info *sub_info; int retval; - if (!khelper_wq) - return -EBUSY; - - if (path[0] == '\0') - return 0; + if (sub_info->path[0] == '\0') { + retval = 0; + goto out; + } - sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); - if (!sub_info) - return -ENOMEM; + if (!khelper_wq) { + retval = -EBUSY; + goto out; + } - INIT_WORK(&sub_info->work, __call_usermodehelper); sub_info->complete = &done; - sub_info->path = path; - sub_info->argv = argv; - sub_info->envp = envp; - sub_info->ring = session_keyring; sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); @@ -299,47 +382,43 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, return 0; wait_for_completion(&done); retval = sub_info->retval; - kfree(sub_info); + + out: + call_usermodehelper_freeinfo(sub_info); return retval; } -EXPORT_SYMBOL(call_usermodehelper_keys); +EXPORT_SYMBOL(call_usermodehelper_exec); +/** + * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @filp: set to the write-end of a pipe + * + * This is a simple wrapper which executes a usermode-helper function + * with a pipe as stdin. It is implemented entirely in terms of + * lower-level call_usermodehelper_* functions. 
+ */ int call_usermodehelper_pipe(char *path, char **argv, char **envp, struct file **filp) { - DECLARE_COMPLETION(done); - struct subprocess_info sub_info = { - .work = __WORK_INITIALIZER(sub_info.work, - __call_usermodehelper), - .complete = &done, - .path = path, - .argv = argv, - .envp = envp, - .retval = 0, - }; - struct file *f; - - if (!khelper_wq) - return -EBUSY; + struct subprocess_info *sub_info; + int ret; - if (path[0] == '\0') - return 0; + sub_info = call_usermodehelper_setup(path, argv, envp); + if (sub_info == NULL) + return -ENOMEM; - f = create_write_pipe(); - if (IS_ERR(f)) - return PTR_ERR(f); - *filp = f; + ret = call_usermodehelper_stdinpipe(sub_info, filp); + if (ret < 0) + goto out; - f = create_read_pipe(f); - if (IS_ERR(f)) { - free_write_pipe(*filp); - return PTR_ERR(f); - } - sub_info.stdin = f; + return call_usermodehelper_exec(sub_info, 1); - queue_work(khelper_wq, &sub_info.work); - wait_for_completion(&done); - return sub_info.retval; + out: + call_usermodehelper_freeinfo(sub_info); + return ret; } EXPORT_SYMBOL(call_usermodehelper_pipe); -- cgit v1.1 From 10a0a8d4e3f6bf2d077f94344441909abe670f5a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:02 -0700 Subject: Add common orderly_poweroff() Various pieces of code around the kernel want to be able to trigger an orderly poweroff. This pulls them together into a single implementation. By default the poweroff command is /sbin/poweroff, but it can be set via sysctl: kernel/poweroff_cmd. This is split at whitespace, so it can include command-line arguments. This patch replaces four other instances of invoking either "poweroff" or "shutdown -h now": two sbus drivers, and acpi thermal management. sparc64 has its own "powerd"; still need to determine whether it should be replaced by orderly_poweroff(). Signed-off-by: Jeremy Fitzhardinge Acked-by: Len Brown Signed-off-by: Chris Wright Cc: Andrew Morton Cc: Randy Dunlap Cc: Andi Kleen Cc: Al Viro Cc: Arnd Bergmann Cc: David S. Miller --- kernel/sys.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 10 ++++++++++ 2 files changed, 68 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 4d141ae..aeded9a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2286,3 +2286,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, } return err ? -EFAULT : 0; } + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static void argv_cleanup(char **argv, char **envp) +{ + argv_free(argv); +} + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. 
+ */ +int orderly_poweroff(bool force) +{ + int argc; + char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); + static char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + int ret = -ENOMEM; + struct subprocess_info *info; + + if (argv == NULL) { + printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", + __func__, poweroff_cmd); + goto out; + } + + info = call_usermodehelper_setup(argv[0], argv, envp); + if (info == NULL) { + argv_free(argv); + goto out; + } + + call_usermodehelper_setcleanup(info, argv_cleanup); + + ret = call_usermodehelper_exec(info, -1); + + out: + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + + /* I guess this should try to kick off some daemon to + sync and poweroff asap. Or not even bother syncing + if we're doing an emergency shutdown? */ + emergency_sync(); + kernel_power_off(); + } + + return ret; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7063ebc..44a1d69 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -705,6 +706,15 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, { .ctl_name = 0 } }; -- cgit v1.1 From 86313c488a6848b7ec2ba04e74f25f79dd32a0b7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: usermodehelper: Tidy up waiting Rather than using a tri-state integer for the wait flag in call_usermodehelper_exec, define a proper enum, and use that. I've preserved the integer values so that any callers I've missed should still work OK. 
Signed-off-by: Jeremy Fitzhardinge Cc: James Bottomley Cc: Randy Dunlap Cc: Christoph Hellwig Cc: Andi Kleen Cc: Paul Mackerras Cc: Johannes Berg Cc: Ralf Baechle Cc: Bjorn Helgaas Cc: Joel Becker Cc: Tony Luck Cc: Kay Sievers Cc: Srivatsa Vaddagiri Cc: Oleg Nesterov Cc: David Howells --- kernel/cpuset.c | 2 +- kernel/kmod.c | 27 ++++++++++++++++----------- kernel/sys.c | 2 +- 3 files changed, 18 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b4796d8..57e6448 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; - call_usermodehelper(argv[0], argv, envp, 0); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); kfree(pathbuf); } diff --git a/kernel/kmod.c b/kernel/kmod.c index d2dce71..78d365c 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -119,7 +119,7 @@ struct subprocess_info { char **argv; char **envp; struct key *ring; - int wait; + enum umh_wait wait; int retval; struct file *stdin; void (*cleanup)(char **argv, char **envp); @@ -225,7 +225,7 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - if (sub_info->wait < 0) + if (sub_info->wait == UMH_NO_WAIT) call_usermodehelper_freeinfo(sub_info); else complete(sub_info->complete); @@ -238,26 +238,31 @@ static void __call_usermodehelper(struct work_struct *work) struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); pid_t pid; - int wait = sub_info->wait; + enum umh_wait wait = sub_info->wait; /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. */ - if (wait) + if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); else pid = kernel_thread(____call_usermodehelper, sub_info, CLONE_VFORK | SIGCHLD); - if (wait < 0) - return; + switch (wait) { + case UMH_NO_WAIT: + break; - if (pid < 0) { + case UMH_WAIT_PROC: + if (pid > 0) + break; sub_info->retval = pid; + /* FALLTHROUGH */ + + case UMH_WAIT_EXEC: complete(sub_info->complete); - } else if (!wait) - complete(sub_info->complete); + } } /** @@ -359,7 +364,7 @@ EXPORT_SYMBOL(call_usermodehelper_stdinpipe); * (ie. it runs with full root capabilities). */ int call_usermodehelper_exec(struct subprocess_info *sub_info, - int wait) + enum umh_wait wait) { DECLARE_COMPLETION_ONSTACK(done); int retval; @@ -378,7 +383,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); - if (wait < 0) /* task has freed sub_info */ + if (wait == UMH_NO_WAIT) /* task has freed sub_info */ return 0; wait_for_completion(&done); retval = sub_info->retval; diff --git a/kernel/sys.c b/kernel/sys.c index aeded9a..18987c7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2327,7 +2327,7 @@ int orderly_poweroff(bool force) call_usermodehelper_setcleanup(info, argv_cleanup); - ret = call_usermodehelper_exec(info, -1); + ret = call_usermodehelper_exec(info, UMH_NO_WAIT); out: if (ret && force) { -- cgit v1.1 From 471d0558045fe35f8c5f291c1ee63815eb9c2dcd Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 12 Jul 2007 16:55:07 -0400 Subject: PM: Remove deprecated sysfs files This patch (as932) removes the deprecated sysfs .../power/state attribute files. 
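Assuming the enum is exported from include/linux/kmod.h with the preserved values (-1, 0, 1), call sites now read like this sketch (argv/envp handling and error checking are elided, and the helper path is whatever the caller supplies):

  #include <linux/kmod.h>

  /* Hypothetical call sites showing the three waiting modes. */
  static void frob_helper_examples(char **argv, char **envp)
  {
          /* block until the helper process has exited */
          call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);

          /* wait only for the exec itself to succeed or fail */
          call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);

          /* fire and forget; no useful error is reported back */
          call_usermodehelper(argv[0], argv, envp, UMH_NO_WAIT);
  }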
Signed-off-by: Alan Stern Acked-by: Pavel Machek Signed-off-by: Greg Kroah-Hartman --- kernel/power/Kconfig | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 495b7d4..7332847 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -65,18 +65,6 @@ config PM_TRACE CAUTION: this option will cause your machine's real-time clock to be set to an invalid time after a resume. -config PM_SYSFS_DEPRECATED - bool "Driver model /sys/devices/.../power/state files (DEPRECATED)" - depends on PM && SYSFS - default n - help - The driver model started out with a sysfs file intended to provide - a userspace hook for device power management. This feature has never - worked very well, except for limited testing purposes, and so it will - be removed. It's not clear that a generic mechanism could really - handle the wide variability of device power states; any replacements - are likely to be bus or driver specific. - config SOFTWARE_SUSPEND bool "Software Suspend (Hibernation)" depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) -- cgit v1.1 From 83c54070ee1a2d05c89793884bea1a03f2851ed4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 19 Jul 2007 01:47:05 -0700 Subject: mm: fault feedback #2 This patch completes Linus's wish that the fault return codes be made into bit flags, which I agree makes everything nicer. This requires requires all handle_mm_fault callers to be modified (possibly the modifications should go further and do things like fault accounting in handle_mm_fault -- however that would be for another patch). [akpm@linux-foundation.org: fix alpha build] [akpm@linux-foundation.org: fix s390 build] [akpm@linux-foundation.org: fix sparc build] [akpm@linux-foundation.org: fix sparc64 build] [akpm@linux-foundation.org: fix ia64 build] Signed-off-by: Nick Piggin Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Russell King Cc: Ian Molton Cc: Bryan Wu Cc: Mikael Starvik Cc: David Howells Cc: Yoshinori Sato Cc: "Luck, Tony" Cc: Hirokazu Takata Cc: Geert Uytterhoeven Cc: Roman Zippel Cc: Greg Ungerer Cc: Matthew Wilcox Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Paul Mundt Cc: Kazumoto Kojima Cc: Richard Curnow Cc: William Lee Irwin III Cc: "David S. 
Miller" Cc: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Miles Bader Cc: Chris Zankel Acked-by: Kyle McMartin Acked-by: Haavard Skinnemoen Acked-by: Ralf Baechle Acked-by: Andi Kleen Signed-off-by: Andrew Morton [ Still apparently needs some ARM and PPC loving - Linus ] Signed-off-by: Linus Torvalds --- kernel/futex.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5c3f45d..a124250 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -346,15 +346,20 @@ static int futex_handle_fault(unsigned long address, vma = find_vma(mm, address); if (vma && address >= vma->vm_start && (vma->vm_flags & VM_WRITE)) { - switch (handle_mm_fault(mm, vma, address, 1)) { - case VM_FAULT_MINOR: - ret = 0; - current->min_flt++; - break; - case VM_FAULT_MAJOR: + int fault; + fault = handle_mm_fault(mm, vma, address, 1); + if (unlikely((fault & VM_FAULT_ERROR))) { +#if 0 + /* XXX: let's do this when we verify it is OK */ + if (ret & VM_FAULT_OOM) + ret = -ENOMEM; +#endif + } else { ret = 0; - current->maj_flt++; - break; + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; } } if (!fshared) -- cgit v1.1 From 328616e3b76859f1abdd08a8df1ddbb7bb81f807 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Jul 2007 01:47:26 -0700 Subject: freezer: run show_state() when freezing times out To see which tasks are stuck where. Cc: "Rafael J. Wysocki" Cc: Oleg Nesterov Cc: Alan Stern Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index e0233d8..b850173 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -157,6 +157,7 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) freeze_user_space ? "user space processes" : "kernel threads", TIMEOUT / HZ, todo); + show_state(); read_lock(&tasklist_lock); do_each_thread(g, p) { if (freeze_user_space && !is_user_space(p)) -- cgit v1.1 From a0349828d6d6f95c445674c2953ee9db75c11f8f Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Thu, 19 Jul 2007 01:47:27 -0700 Subject: PM: Do not require dev spew to get PM_DEBUG In order to enable things like PM_TRACE, you're required to enable PM_DEBUG, which sends a large spew of messages on boot, and often times can overflow dmesg buffer. Create new PM_VERBOSE and shift that to be the option that enables drivers/base/power's messages. Signed-off-by: Ben Collins Cc: "Rafael J. Wysocki" Cc: Pavel Machek Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 7332847..7358609 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -33,13 +33,20 @@ config PM_DEBUG bool "Power Management Debug Support" depends on PM ---help--- - This option enables verbose debugging support in the Power Management - code. This is helpful when debugging and reporting various PM bugs, - like suspend support. + This option enables various debugging support in the Power Management + code. This is helpful when debugging and reporting PM bugs, like + suspend support. + +config PM_VERBOSE + bool "Verbose Power Management debugging" + depends on PM_DEBUG + default n + ---help--- + This option enables verbose messages from the Power Management code. 
config DISABLE_CONSOLE_SUSPEND bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" - depends on PM && PM_DEBUG + depends on PM_DEBUG default n ---help--- This option turns off the console suspend mechanism that prevents @@ -50,7 +57,7 @@ config DISABLE_CONSOLE_SUSPEND config PM_TRACE bool "Suspend/resume event tracing" - depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL + depends on PM_DEBUG && X86_32 && EXPERIMENTAL default n ---help--- This enables some cheesy code to save the last PM event point in the -- cgit v1.1 From 127067a9c994dff16b280f409cc7b18a54a63719 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:28 -0700 Subject: swsusp: remove incorrect code from user.c In the face of the recent change of suspend code ordering (cf. http://marc.info/?l=linux-acpi&m=117938245931603&w=2) we should also modify the code ordering in swsusp so that hibernation_ops->prepare() is executed after device_suspend(). However, for this purpose it seems reasonable to eliminate the code duplication between kernel/power/disk.c and kernel/power/user.c first. By eliminating it we can reduce the size of user.c quite substantially and remove the maintenance difficulty with making essentially the same changes in two different places. Moreover, we should also remove the calls to "platform" functions from the restore code path, since it doesn't carry out any power transition of the system, but we generally need to disable the GPEs before the restore if the 'platform' hibernation mode has been used. To do this, we can introduce two new hibernation_ops to be used in the restore code. This patch: Make the code hibernation code in kernel/power/user.c be functionally equivalent to the corresponding code in kernel/power/disk.c , as it should be. The calls to the platform functions removed by this patch are incorrect. They should be replaced with some other "platform" invocations that will be introduced in one of the subsequent patches. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/user.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index d65305b..09468ec 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -181,34 +181,25 @@ static inline int snapshot_suspend(int platform_suspend) return error; } -static inline int snapshot_restore(int platform_suspend) +static inline int snapshot_restore(void) { int error; mutex_lock(&pm_mutex); pm_prepare_console(); - if (platform_suspend) { - error = platform_prepare(); - if (error) - goto Finish; - } suspend_console(); error = device_suspend(PMSG_PRETHAW); if (error) - goto Resume_devices; + goto Finish; error = disable_nonboot_cpus(); if (!error) error = swsusp_resume(); enable_nonboot_cpus(); - Resume_devices: - if (platform_suspend) - platform_finish(); - + Finish: device_resume(); resume_console(); - Finish: pm_restore_console(); mutex_unlock(&pm_mutex); return error; @@ -274,7 +265,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - error = snapshot_restore(data->platform_suspend); + error = snapshot_restore(); break; case SNAPSHOT_FREE: -- cgit v1.1 From 7777fab989b5d006903188c966058ebcd2d6342a Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 19 Jul 2007 01:47:29 -0700 Subject: swsusp: remove code duplication between disk.c and user.c Currently, much of the code in kernel/power/disk.c is duplicated in kernel/power/user.c , mainly for historical reasons. By eliminating this code duplication we can reduce the size of user.c quite substantially and remove the maintenance difficulty resulting from it. [bunk@stusta.de: kernel/power/disk.c: make code static] Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 184 +++++++++++++++++++++++++++++---------------------- kernel/power/power.h | 5 +- kernel/power/user.c | 96 ++------------------------- 3 files changed, 115 insertions(+), 170 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f445b9c..47882bf 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -45,7 +45,7 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; -struct hibernation_ops *hibernation_ops; +static struct hibernation_ops *hibernation_ops; /** * hibernation_set_ops - set the global hibernate operations @@ -74,9 +74,9 @@ void hibernation_set_ops(struct hibernation_ops *ops) * platform driver if so configured and return an error code if it fails */ -static int platform_prepare(void) +static int platform_prepare(int platform_mode) { - return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ? + return (platform_mode && hibernation_ops) ? hibernation_ops->prepare() : 0; } @@ -85,13 +85,104 @@ static int platform_prepare(void) * using the platform driver (must be called after platform_prepare()) */ -static void platform_finish(void) +static void platform_finish(int platform_mode) { - if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) + if (platform_mode && hibernation_ops) hibernation_ops->finish(); } /** + * hibernation_snapshot - quiesce devices and create the hibernation + * snapshot image. + * @platform_mode - if set, use the platform driver, if available, to + * prepare the platform frimware for the power transition. + * + * Must be called with pm_mutex held + */ + +int hibernation_snapshot(int platform_mode) +{ + int error; + + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); + if (error) + goto Finish; + + error = platform_prepare(platform_mode); + if (error) + goto Finish; + + suspend_console(); + error = device_suspend(PMSG_FREEZE); + if (error) + goto Resume_devices; + + error = disable_nonboot_cpus(); + if (!error) { + if (hibernation_mode != HIBERNATION_TEST) { + in_suspend = 1; + error = swsusp_suspend(); + /* Control returns here after successful restore */ + } else { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + } + } + enable_nonboot_cpus(); + Resume_devices: + platform_finish(platform_mode); + device_resume(); + resume_console(); + Finish: + return error; +} + +/** + * hibernation_restore - quiesce devices and restore the hibernation + * snapshot image. 
If successful, control returns in hibernation_snaphot() + * + * Must be called with pm_mutex held + */ + +int hibernation_restore(void) +{ + int error; + + pm_prepare_console(); + suspend_console(); + error = device_suspend(PMSG_PRETHAW); + if (error) + goto Finish; + + error = disable_nonboot_cpus(); + if (!error) + error = swsusp_resume(); + + enable_nonboot_cpus(); + Finish: + device_resume(); + resume_console(); + pm_restore_console(); + return error; +} + +/** + * hibernation_platform_enter - enter the hibernation state using the + * platform driver (if available) + */ + +int hibernation_platform_enter(void) +{ + if (hibernation_ops) { + kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); + return hibernation_ops->enter(); + } else { + return -ENOSYS; + } +} + +/** * power_down - Shut the machine down for hibernation. * * Use the platform driver, if configured so; otherwise try @@ -111,11 +202,7 @@ static void power_down(void) kernel_restart(NULL); break; case HIBERNATION_PLATFORM: - if (hibernation_ops) { - kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - hibernation_ops->enter(); - break; - } + hibernation_platform_enter(); } kernel_halt(); /* @@ -171,62 +258,17 @@ int hibernate(void) mdelay(5000); goto Thaw; } - - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); - if (error) - goto Thaw; - - error = platform_prepare(); - if (error) - goto Thaw; - - suspend_console(); - error = device_suspend(PMSG_FREEZE); - if (error) { - printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Resume_devices; - } - error = disable_nonboot_cpus(); - if (error) - goto Enable_cpus; - - if (hibernation_mode == HIBERNATION_TEST) { - printk("swsusp debug: Waiting for 5 seconds.\n"); - mdelay(5000); - goto Enable_cpus; - } - - pr_debug("PM: snapshotting memory.\n"); - in_suspend = 1; - error = swsusp_suspend(); - if (error) - goto Enable_cpus; - - if (in_suspend) { - enable_nonboot_cpus(); - platform_finish(); - device_resume(); - resume_console(); + error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); + if (in_suspend && !error) { pr_debug("PM: writing image.\n"); error = swsusp_write(); + swsusp_free(); if (!error) power_down(); - else { - swsusp_free(); - goto Thaw; - } } else { pr_debug("PM: Image restored successfully.\n"); + swsusp_free(); } - - swsusp_free(); - Enable_cpus: - enable_nonboot_cpus(); - Resume_devices: - platform_finish(); - device_resume(); - resume_console(); Thaw: mutex_unlock(&pm_mutex); unprepare_processes(); @@ -301,29 +343,11 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); error = swsusp_read(); - if (error) { - swsusp_free(); - goto Thaw; - } - - pr_debug("PM: Preparing devices for restore.\n"); - - suspend_console(); - error = device_suspend(PMSG_PRETHAW); - if (error) - goto Free; - - error = disable_nonboot_cpus(); if (!error) - swsusp_resume(); + hibernation_restore(); - enable_nonboot_cpus(); - Free: - swsusp_free(); - device_resume(); - resume_console(); - Thaw: printk(KERN_ERR "PM: Restore failed, recovering.\n"); + swsusp_free(); unprepare_processes(); Done: free_basic_memory_bitmaps(); @@ -333,7 +357,7 @@ static int software_resume(void) Unlock: mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); - return 0; + return error; } late_initcall(software_resume); diff --git a/kernel/power/power.h b/kernel/power/power.h index 5138148..70c378b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -25,7 +25,10 @@ struct swsusp_info { */ #define SPARE_PAGES ((1024 * 
1024) >> PAGE_SHIFT) -extern struct hibernation_ops *hibernation_ops; +/* kernel/power/disk.c */ +extern int hibernation_snapshot(int platform_mode); +extern int hibernation_restore(void); +extern int hibernation_platform_enter(void); #endif extern int pfn_is_nosave(unsigned long); diff --git a/kernel/power/user.c b/kernel/power/user.c index 09468ec..bfed3b9 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -128,83 +128,6 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, return res; } -static inline int platform_prepare(void) -{ - int error = 0; - - if (hibernation_ops) - error = hibernation_ops->prepare(); - - return error; -} - -static inline void platform_finish(void) -{ - if (hibernation_ops) - hibernation_ops->finish(); -} - -static inline int snapshot_suspend(int platform_suspend) -{ - int error; - - mutex_lock(&pm_mutex); - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); - if (error) - goto Finish; - - if (platform_suspend) { - error = platform_prepare(); - if (error) - goto Finish; - } - suspend_console(); - error = device_suspend(PMSG_FREEZE); - if (error) - goto Resume_devices; - - error = disable_nonboot_cpus(); - if (!error) { - in_suspend = 1; - error = swsusp_suspend(); - } - enable_nonboot_cpus(); - Resume_devices: - if (platform_suspend) - platform_finish(); - - device_resume(); - resume_console(); - Finish: - mutex_unlock(&pm_mutex); - return error; -} - -static inline int snapshot_restore(void) -{ - int error; - - mutex_lock(&pm_mutex); - pm_prepare_console(); - suspend_console(); - error = device_suspend(PMSG_PRETHAW); - if (error) - goto Finish; - - error = disable_nonboot_cpus(); - if (!error) - error = swsusp_resume(); - - enable_nonboot_cpus(); - Finish: - device_resume(); - resume_console(); - pm_restore_console(); - mutex_unlock(&pm_mutex); - return error; -} - static int snapshot_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { @@ -251,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - error = snapshot_suspend(data->platform_suspend); + error = hibernation_snapshot(data->platform_suspend); if (!error) error = put_user(in_suspend, (unsigned int __user *)arg); if (!error) @@ -265,7 +188,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - error = snapshot_restore(); + error = hibernation_restore(); break; case SNAPSHOT_FREE: @@ -377,19 +300,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, switch (arg) { case PMOPS_PREPARE: - if (hibernation_ops) { - data->platform_suspend = 1; - error = 0; - } else { - error = -ENOSYS; - } + data->platform_suspend = 1; + error = 0; break; case PMOPS_ENTER: - if (data->platform_suspend) { - kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - error = hibernation_ops->enter(); - } + if (data->platform_suspend) + error = hibernation_platform_enter(); + break; case PMOPS_FINISH: -- cgit v1.1 From a634cc10164d1c229fbeca33923e6a0ed939e894 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:30 -0700 Subject: swsusp: introduce restore platform operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At least on some machines it is necessary to prepare the ACPI firmware for the restoration of the system memory state from the hibernation image if the "platform" mode of hibernation has been used. 
Namely, in those cases we need to disable the GPEs before replacing the "boot" kernel with the "frozen" kernel (cf. http://bugzilla.kernel.org/show_bug.cgi?id=7887). After the restore they will be re-enabled by hibernation_ops->finish(), but if the restore fails, they have to be re-enabled by the restore code explicitly. For this purpose we can introduce two additional hibernation operations, called pre_restore() and restore_cleanup(), and call them from the restore code path. Still, they should only be called if the "platform" mode of hibernation has been used, so we need to pass the information about the hibernation mode from the "frozen" kernel to the "boot" kernel in the image header. Apparently, we can't drop the disabling of GPEs before the restore because of Bug #7887. We also can't do it unconditionally, because the GPEs wouldn't have been enabled after a successful restore if the suspend had been done in the 'shutdown' or 'reboot' mode. In principle we could (and probably should) unconditionally disable the GPEs before each snapshot creation *and* before the restore, but then we'd have to unconditionally enable them after the snapshot creation as well as after the restore (or restore failure). Still, for this purpose we'd need to modify acpi_enter_sleep_state_prep() and acpi_leave_sleep_state() and we'd have to introduce some mechanism synchronizing the disabling/enabling of the GPEs with the device drivers' .suspend()/.resume() routines and with disable_/enable_nonboot_cpus(). However, this would have affected the suspend (i.e. s2ram) code as well as the hibernation, which I'd like to avoid in this patch series. Signed-off-by: Rafael J. Wysocki Cc: Nigel Cunningham Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 56 ++++++++++++++++++++++++++++++++++++++++++---------- kernel/power/power.h | 13 +++++++++--- kernel/power/swap.c | 20 ++++++++++++++----- kernel/power/user.c | 2 +- 4 files changed, 72 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 47882bf..fa3b43b 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -54,7 +54,8 @@ static struct hibernation_ops *hibernation_ops; void hibernation_set_ops(struct hibernation_ops *ops) { - if (ops && !(ops->prepare && ops->enter && ops->finish)) { + if (ops && !(ops->prepare && ops->enter && ops->finish + && ops->pre_restore && ops->restore_cleanup)) { WARN_ON(1); return; } @@ -92,6 +93,31 @@ static void platform_finish(int platform_mode) } /** + * platform_pre_restore - prepare the platform for the restoration from a + * hibernation image. If the restore fails after this function has been + * called, platform_restore_cleanup() must be called. + */ + +static int platform_pre_restore(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_restore() : 0; +} + +/** + * platform_restore_cleanup - switch the platform to the normal mode of + * operation after a failing restore. If platform_pre_restore() has been + * called before the failing restore, this function must be called too, + * regardless of the result of platform_pre_restore(). + */ + +static void platform_restore_cleanup(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->restore_cleanup(); +} + +/** * hibernation_snapshot - quiesce devices and create the hibernation * snapshot image. 
* @platform_mode - if set, use the platform driver, if available, to @@ -141,11 +167,13 @@ int hibernation_snapshot(int platform_mode) /** * hibernation_restore - quiesce devices and restore the hibernation * snapshot image. If successful, control returns in hibernation_snaphot() + * @platform_mode - if set, use the platform driver, if available, to + * prepare the platform frimware for the transition. * * Must be called with pm_mutex held */ -int hibernation_restore(void) +int hibernation_restore(int platform_mode) { int error; @@ -155,11 +183,14 @@ int hibernation_restore(void) if (error) goto Finish; - error = disable_nonboot_cpus(); - if (!error) - error = swsusp_resume(); - - enable_nonboot_cpus(); + error = platform_pre_restore(platform_mode); + if (!error) { + error = disable_nonboot_cpus(); + if (!error) + error = swsusp_resume(); + enable_nonboot_cpus(); + } + platform_restore_cleanup(platform_mode); Finish: device_resume(); resume_console(); @@ -260,8 +291,12 @@ int hibernate(void) } error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); if (in_suspend && !error) { + unsigned int flags = 0; + + if (hibernation_mode == HIBERNATION_PLATFORM) + flags |= SF_PLATFORM_MODE; pr_debug("PM: writing image.\n"); - error = swsusp_write(); + error = swsusp_write(flags); swsusp_free(); if (!error) power_down(); @@ -295,6 +330,7 @@ int hibernate(void) static int software_resume(void) { int error; + unsigned int flags; mutex_lock(&pm_mutex); if (!swsusp_resume_device) { @@ -342,9 +378,9 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - error = swsusp_read(); + error = swsusp_read(&flags); if (!error) - hibernation_restore(); + hibernation_restore(flags & SF_PLATFORM_MODE); printk(KERN_ERR "PM: Restore failed, recovering.\n"); swsusp_free(); diff --git a/kernel/power/power.h b/kernel/power/power.h index 70c378b..eab3603 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -27,7 +27,7 @@ struct swsusp_info { /* kernel/power/disk.c */ extern int hibernation_snapshot(int platform_mode); -extern int hibernation_restore(void); +extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); #endif @@ -155,13 +155,20 @@ extern sector_t alloc_swapdev_block(int swap); extern void free_all_swap_pages(int swap); extern int swsusp_swap_in_use(void); +/* + * Flags that can be passed from the hibernatig hernel to the "boot" kernel in + * the image header. 
+ */ +#define SF_PLATFORM_MODE 1 + +/* kernel/power/disk.c */ extern int swsusp_check(void); extern int swsusp_shrink_memory(void); extern void swsusp_free(void); extern int swsusp_suspend(void); extern int swsusp_resume(void); -extern int swsusp_read(void); -extern int swsusp_write(void); +extern int swsusp_read(unsigned int *flags_p); +extern int swsusp_write(unsigned int flags); extern void swsusp_close(void); extern int suspend_enter(suspend_state_t state); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8b1a1b8..917aba1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -33,8 +33,9 @@ extern char resume_file[]; #define SWSUSP_SIG "S1SUSPEND" struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; + char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; sector_t image; + unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; } __attribute__((packed)); @@ -138,7 +139,7 @@ static int wait_on_bio_chain(struct bio **bio_chain) * Saving part */ -static int mark_swapfiles(sector_t start) +static int mark_swapfiles(sector_t start, unsigned int flags) { int error; @@ -148,6 +149,7 @@ static int mark_swapfiles(sector_t start) memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); memcpy(swsusp_header->sig,SWSUSP_SIG, 10); swsusp_header->image = start; + swsusp_header->flags = flags; error = bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { @@ -369,6 +371,7 @@ static int enough_swap(unsigned int nr_pages) /** * swsusp_write - Write entire image and metadata. + * @flags: flags to pass to the "boot" kernel in the image header * * It is important _NOT_ to umount filesystems at this point. We want * them synced (in case something goes wrong) but we DO not want to mark @@ -376,7 +379,7 @@ static int enough_swap(unsigned int nr_pages) * correctly, we'll mark system clean, anyway.) */ -int swsusp_write(void) +int swsusp_write(unsigned int flags) { struct swap_map_handle handle; struct snapshot_handle snapshot; @@ -415,7 +418,7 @@ int swsusp_write(void) if (!error) { flush_swap_writer(&handle); printk("S"); - error = mark_swapfiles(start); + error = mark_swapfiles(start, flags); printk("|\n"); } } @@ -540,13 +543,20 @@ static int load_image(struct swap_map_handle *handle, return error; } -int swsusp_read(void) +/** + * swsusp_read - read the hibernation image. + * @flags_p: flags passed by the "frozen" kernel in the image header should + * be written into this memeory location + */ + +int swsusp_read(unsigned int *flags_p) { int error; struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; + *flags_p = swsusp_header->flags; if (IS_ERR(resume_bdev)) { pr_debug("swsusp: block device not initialised\n"); return PTR_ERR(resume_bdev); diff --git a/kernel/power/user.c b/kernel/power/user.c index bfed3b9..1f24f30 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -188,7 +188,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - error = hibernation_restore(); + error = hibernation_restore(data->platform_suspend); break; case SNAPSHOT_FREE: -- cgit v1.1 From 10a1803d667e209914eaada9b95525252f23ec78 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:31 -0700 Subject: swsusp: fix hibernation code ordering Change the code ordering so that hibernation_ops->prepare() is called after device_suspend(). 
This is needed so that we don't violate the ACPI specification, which states that the _PTS and _GTS system-control methods, executed from acpi_sleep_prepare(), ought to be called after devices have been put in low power states. The "Finish" label in hibernation_restore() is moved, because device_suspend() resumes devices if the suspending of them fails and the restore code ordering should reflect the hibernation code ordering. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index fa3b43b..77ac605 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -133,15 +133,15 @@ int hibernation_snapshot(int platform_mode) /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); if (error) - goto Finish; - - error = platform_prepare(platform_mode); - if (error) - goto Finish; + return error; suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) + goto Resume_console; + + error = platform_prepare(platform_mode); + if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -159,8 +159,8 @@ int hibernation_snapshot(int platform_mode) Resume_devices: platform_finish(platform_mode); device_resume(); + Resume_console: resume_console(); - Finish: return error; } @@ -191,8 +191,8 @@ int hibernation_restore(int platform_mode) enable_nonboot_cpus(); } platform_restore_cleanup(platform_mode); - Finish: device_resume(); + Finish: resume_console(); pm_restore_console(); return error; -- cgit v1.1 From b1457bcc3a00a0446c7f6e2f22fd24b6d8d0a309 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:31 -0700 Subject: Hibernation: prepare to enter the low power state During hibernation we call hibernation_ops->prepare() before creating the image, but then, before saving it, we cancel the power transition by calling hibernation_ops->finish(). Thus prior to calling hibernation_ops->enter() we should let the platform firmware know that we're going to enter the low power state after all. Signed-off-by: Rafael J. Wysocki Cc: Gautham R Shenoy Cc: Pavel Machek Cc: Nigel Cunningham Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 77ac605..885c653 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -205,12 +205,23 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { + int error; + if (hibernation_ops) { kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - return hibernation_ops->enter(); + /* + * We have cancelled the power transition by running + * hibernation_ops->finish() before saving the image, so we + * should let the firmware know that we're going to enter the + * sleep state after all + */ + error = hibernation_ops->prepare(); + if (!error) + error = hibernation_ops->enter(); } else { - return -ENOSYS; + error = -ENOSYS; } + return error; } /** -- cgit v1.1 From 0c1eecfb345401629aa57c9d3b077273e56c45a7 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 19 Jul 2007 01:47:33 -0700 Subject: Freezer: avoid freezing kernel threads prematurely Kernel threads should not have TIF_FREEZE set when user space processes are being frozen, since otherwise some of them might be frozen prematurely. To prevent this from happening we can (1) make exit_mm() unset TIF_FREEZE unconditionally just after clearing tsk->mm and (2) make try_to_freeze_tasks() check if p->mm is different from zero and PF_BORROWED_MM is unset in p->flags when user space processes are to be frozen. Namely, when user space processes are being frozen, we only should set TIF_FREEZE for tasks that have p->mm different from NULL and don't have PF_BORROWED_MM set in p->flags. For this reason task_lock() must be used to prevent try_to_freeze_tasks() from racing with use_mm()/unuse_mm(), in which p->mm and p->flags.PF_BORROWED_MM are changed under task_lock(p). Also, we need to prevent the following scenario from happening: * daemonize() is called by a task spawned from a user space code path * freezer checks if the task has p->mm set and the result is positive * task enters exit_mm() and clears its TIF_FREEZE * freezer sets TIF_FREEZE for the task * task calls try_to_freeze() and goes to the refrigerator, which is wrong at that point This requires us to acquire task_lock(p) before p->flags.PF_BORROWED_MM and p->mm are examined and release it after TIF_FREEZE is set for p (or it turns out that TIF_FREEZE should not be set). Signed-off-by: Rafael J. Wysocki Cc: Gautham R Shenoy Cc: Pavel Machek Cc: Nigel Cunningham Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 +++ kernel/power/process.c | 64 ++++++++++++++++++++++++++------------------------ 2 files changed, 36 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e8af8d0..464c2b1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -594,6 +595,8 @@ static void exit_mm(struct task_struct * tsk) tsk->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); + /* We don't want this task to be frozen prematurely */ + clear_freeze_flag(tsk); task_unlock(tsk); mmput(mm); } diff --git a/kernel/power/process.c b/kernel/power/process.c index b850173..e1bcded 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -40,7 +40,7 @@ static inline void frozen_process(void) current->flags |= PF_FROZEN; wmb(); } - clear_tsk_thread_flag(current, TIF_FREEZE); + clear_freeze_flag(current); } /* Refrigerator is place where frozen processes are stored :-). 
*/ @@ -75,17 +75,16 @@ void refrigerator(void) current->state = save; } -static inline void freeze_process(struct task_struct *p) +static void freeze_task(struct task_struct *p) { unsigned long flags; if (!freezing(p)) { rmb(); if (!frozen(p)) { + set_freeze_flag(p); if (p->state == TASK_STOPPED) force_sig_specific(SIGSTOP, p); - - freeze(p); spin_lock_irqsave(&p->sighand->siglock, flags); signal_wake_up(p, p->state == TASK_STOPPED); spin_unlock_irqrestore(&p->sighand->siglock, flags); @@ -99,18 +98,13 @@ static void cancel_freezing(struct task_struct *p) if (freezing(p)) { pr_debug(" clean up: %s\n", p->comm); - do_not_freeze(p); + clear_freeze_flag(p); spin_lock_irqsave(&p->sighand->siglock, flags); recalc_sigpending_and_wake(p); spin_unlock_irqrestore(&p->sighand->siglock, flags); } } -static inline int is_user_space(struct task_struct *p) -{ - return p->mm && !(p->flags & PF_BORROWED_MM); -} - static unsigned int try_to_freeze_tasks(int freeze_user_space) { struct task_struct *g, *p; @@ -122,20 +116,34 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (!freezeable(p)) - continue; - - if (frozen(p)) + if (frozen(p) || !freezeable(p)) continue; - if (p->state == TASK_TRACED && frozen(p->parent)) { - cancel_freezing(p); - continue; + if (freeze_user_space) { + if (p->state == TASK_TRACED && + frozen(p->parent)) { + cancel_freezing(p); + continue; + } + /* + * Kernel threads should not have TIF_FREEZE set + * at this point, so we must ensure that either + * p->mm is not NULL *and* PF_BORROWED_MM is + * unset, or TIF_FRREZE is left unset. + * The task_lock() is necessary to prevent races + * with exit_mm() or use_mm()/unuse_mm() from + * occuring. + */ + task_lock(p); + if (!p->mm || (p->flags & PF_BORROWED_MM)) { + task_unlock(p); + continue; + } + freeze_task(p); + task_unlock(p); + } else { + freeze_task(p); } - if (freeze_user_space && !is_user_space(p)) - continue; - - freeze_process(p); if (!freezer_should_skip(p)) todo++; } while_each_thread(g, p); @@ -152,22 +160,16 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) * but it cleans up leftover PF_FREEZE requests. */ printk("\n"); - printk(KERN_ERR "Stopping %s timed out after %d seconds " + printk(KERN_ERR "Freezing of %s timed out after %d seconds " "(%d tasks refusing to freeze):\n", - freeze_user_space ? "user space processes" : - "kernel threads", + freeze_user_space ? "user space " : "tasks ", TIMEOUT / HZ, todo); show_state(); read_lock(&tasklist_lock); do_each_thread(g, p) { - if (freeze_user_space && !is_user_space(p)) - continue; - task_lock(p); - if (freezeable(p) && !frozen(p) && - !freezer_should_skip(p)) + if (freezing(p) && !freezer_should_skip(p)) printk(KERN_ERR " %s\n", p->comm); - cancel_freezing(p); task_unlock(p); } while_each_thread(g, p); @@ -211,7 +213,7 @@ static void thaw_tasks(int thaw_user_space) if (!freezeable(p)) continue; - if (is_user_space(p) == !thaw_user_space) + if (!p->mm == thaw_user_space) continue; thaw_process(p); -- cgit v1.1 From f4a3a7d60c9c9a961e4c970f6eb41dd1c9d3ec21 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:33 -0700 Subject: Freezer: use __set_current_state in refrigerator Use __set_current_state() as appropriate in refrigerator() instead of accessing current->state directly. Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Cc: Gautham R Shenoy Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index e1bcded..9b5301c 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -72,7 +72,7 @@ void refrigerator(void) schedule(); } pr_debug("%s left refrigerator\n", current->comm); - current->state = save; + __set_current_state(save); } static void freeze_task(struct task_struct *p) -- cgit v1.1 From e7cd8a722745a01bcfac4d4a52d53391d177da20 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:34 -0700 Subject: Freezer: return int from freeze_processes Make try_to_freeze_tasks() and freeze_processes() return -EBUSY on failure instead of the number of unfrozen tasks (none of the callers actually uses this number). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Gautham R Shenoy Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 9b5301c..00cdbe5 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -105,7 +105,7 @@ static void cancel_freezing(struct task_struct *p) } } -static unsigned int try_to_freeze_tasks(int freeze_user_space) +static int try_to_freeze_tasks(int freeze_user_space) { struct task_struct *g, *p; unsigned long end_time; @@ -176,28 +176,25 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) read_unlock(&tasklist_lock); } - return todo; + return todo ? -EBUSY : 0; } /** * freeze_processes - tell processes to enter the refrigerator - * - * Returns 0 on success, or the number of processes that didn't freeze, - * although they were told to. */ int freeze_processes(void) { - unsigned int nr_unfrozen; + int error; printk("Stopping tasks ... "); - nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE); - if (nr_unfrozen) - return nr_unfrozen; + error = try_to_freeze_tasks(FREEZER_USER_SPACE); + if (error) + return error; sys_sync(); - nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); - if (nr_unfrozen) - return nr_unfrozen; + error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); + if (error) + return error; printk("done.\n"); BUG_ON(in_atomic()); -- cgit v1.1 From c2cf7d87d804c66e063829d5ca739053e901dc15 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:35 -0700 Subject: Freezer: remove redundant check in try_to_freeze_tasks We don't need to check if todo is positive before calling time_after() in try_to_freeze_tasks(), because if todo is zero at this point, the loop will be broken anyway due to the while () condition being false. Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Cc: Gautham R Shenoy Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 00cdbe5..3434940 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -149,7 +149,7 @@ static int try_to_freeze_tasks(int freeze_user_space) } while_each_thread(g, p); read_unlock(&tasklist_lock); yield(); /* Yield is okay here */ - if (todo && time_after(jiffies, end_time)) + if (time_after(jiffies, end_time)) break; } while (todo); -- cgit v1.1 From b10d911749d37dccfa5873d2088aea3f074b9e45 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:36 -0700 Subject: PM: introduce hibernation and suspend notifiers Make it possible to register hibernation and suspend notifiers, so that subsystems can perform hibernation-related or suspend-related operations that should not be carried out by device drivers' .suspend() and .resume() routines. [akpm@linux-foundation.org: build fixes] [akpm@linux-foundation.org: cleanups] Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 16 ++++++++++++---- kernel/power/main.c | 9 +++++++++ kernel/power/power.h | 10 ++++++++++ kernel/power/user.c | 11 ++++++++--- 4 files changed, 39 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 885c653..324ac01 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -281,9 +281,16 @@ int hibernate(void) { int error; + mutex_lock(&pm_mutex); /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) - return -EBUSY; + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } + + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + if (error) + goto Exit; /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); @@ -294,7 +301,6 @@ int hibernate(void) if (error) goto Finish; - mutex_lock(&pm_mutex); if (hibernation_mode == HIBERNATION_TESTPROC) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); @@ -316,12 +322,14 @@ int hibernate(void) swsusp_free(); } Thaw: - mutex_unlock(&pm_mutex); unprepare_processes(); Finish: free_basic_memory_bitmaps(); Exit: + pm_notifier_call_chain(PM_POST_HIBERNATION); atomic_inc(&snapshot_device_available); + Unlock: + mutex_unlock(&pm_mutex); return error; } diff --git a/kernel/power/main.c b/kernel/power/main.c index fc45ed2..4d26ad3 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -23,6 +23,8 @@ #include "power.h" +BLOCKING_NOTIFIER_HEAD(pm_chain_head); + /*This is just an arbitrary number */ #define FREE_PAGE_NUMBER (100) @@ -78,6 +80,10 @@ static int suspend_prepare(suspend_state_t state) if (!pm_ops || !pm_ops->enter) return -EPERM; + error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); + if (error) + goto Finish; + pm_prepare_console(); if (freeze_processes()) { @@ -125,6 +131,8 @@ static int suspend_prepare(suspend_state_t state) Thaw: thaw_processes(); pm_restore_console(); + Finish: + pm_notifier_call_chain(PM_POST_SUSPEND); return error; } @@ -176,6 +184,7 @@ static void suspend_finish(suspend_state_t state) resume_console(); thaw_processes(); pm_restore_console(); + pm_notifier_call_chain(PM_POST_SUSPEND); } diff --git 
a/kernel/power/power.h b/kernel/power/power.h index eab3603..01c2275 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -173,5 +173,15 @@ extern void swsusp_close(void); extern int suspend_enter(suspend_state_t state); struct timeval; +/* kernel/power/swsusp.c */ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); + +/* kernel/power/main.c */ +extern struct blocking_notifier_head pm_chain_head; + +static inline int pm_notifier_call_chain(unsigned long val) +{ + return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) + == NOTIFY_BAD) ? -EINVAL : 0; +} diff --git a/kernel/power/user.c b/kernel/power/user.c index 1f24f30..7f19afe 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -151,10 +151,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, if (data->frozen) break; mutex_lock(&pm_mutex); - if (freeze_processes()) { - thaw_processes(); - error = -EBUSY; + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + if (!error) { + error = freeze_processes(); + if (error) + thaw_processes(); } + if (error) + pm_notifier_call_chain(PM_POST_HIBERNATION); mutex_unlock(&pm_mutex); if (!error) data->frozen = 1; @@ -165,6 +169,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; mutex_lock(&pm_mutex); thaw_processes(); + pm_notifier_call_chain(PM_POST_HIBERNATION); mutex_unlock(&pm_mutex); data->frozen = 0; break; -- cgit v1.1 From 8cdd4936c17bd8085cb0dfacc4a37ccf8d0ada7b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:36 -0700 Subject: PM: disable usermode helper before hibernation and suspend Use a hibernation and suspend notifier to disable the user mode helper before a hibernation/suspend and enable it after the operation. [akpm@linux-foundation.org: build fix] Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Acked-by: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 78d365c..928f367 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -33,12 +33,22 @@ #include #include #include +#include +#include #include extern int max_threads; static struct workqueue_struct *khelper_wq; +/* + * If set, both call_usermodehelper_keys() and call_usermodehelper_pipe() exit + * immediately returning -EBUSY. Used for preventing user land processes from + * being created after the user land has been frozen during a system-wide + * hibernation or suspend operation. 
+ */ +static int usermodehelper_disabled; + #ifdef CONFIG_KMOD /* @@ -265,6 +275,24 @@ static void __call_usermodehelper(struct work_struct *work) } } +static int usermodehelper_pm_callback(struct notifier_block *nfb, + unsigned long action, + void *ignored) +{ + switch (action) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + usermodehelper_disabled = 1; + return NOTIFY_OK; + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + usermodehelper_disabled = 0; + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + /** * call_usermodehelper_setup - prepare to call a usermode helper * @path - path to usermode executable @@ -374,7 +402,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, goto out; } - if (!khelper_wq) { + if (!khelper_wq || usermodehelper_disabled) { retval = -EBUSY; goto out; } @@ -431,4 +459,5 @@ void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); BUG_ON(!khelper_wq); + pm_notifier(usermodehelper_pm_callback, 0); } -- cgit v1.1 From ccd4b65aef4be2278543fde5b999e55a4d694fd8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:37 -0700 Subject: PM: prevent frozen user mode helpers from failing the freezing of tasks At present, if a user mode helper is running while usermodehelper_pm_callback() is executed, the helper may be frozen and the completion in call_usermodehelper_exec() won't be completed until user space processes are thawed. As a result, the freezing of kernel threads may fail, which is not desirable. Prevent this from happening by introducing a counter of running user mode helpers and allowing usermodehelper_pm_callback() to succeed for action = PM_HIBERNATION_PREPARE or action = PM_SUSPEND_PREPARE only if there are no helpers running. [Namely, usermodehelper_pm_callback() waits for at most RUNNING_HELPERS_TIMEOUT for the number of running helpers to become zero and fails if that doesn't happen.] Special thanks to Uli Luckas , Pavel Machek and Oleg Nesterov for reviewing the previous versions of this patch and for very useful comments. Signed-off-by: Rafael J. Wysocki Acked-by: Uli Luckas Acked-by: Nigel Cunningham Acked-by: Pavel Machek Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 928f367..beedbdc 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -41,14 +41,6 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; -/* - * If set, both call_usermodehelper_keys() and call_usermodehelper_pipe() exit - * immediately returning -EBUSY. Used for preventing user land processes from - * being created after the user land has been frozen during a system-wide - * hibernation or suspend operation. - */ -static int usermodehelper_disabled; - #ifdef CONFIG_KMOD /* @@ -275,15 +267,55 @@ static void __call_usermodehelper(struct work_struct *work) } } +#ifdef CONFIG_PM +/* + * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY + * (used for preventing user land processes from being created after the user + * land has been frozen during a system-wide hibernation or suspend operation). + */ +static int usermodehelper_disabled; + +/* Number of helpers running */ +static atomic_t running_helpers = ATOMIC_INIT(0); + +/* + * Wait queue head used by usermodehelper_pm_callback() to wait for all running + * helpers to finish. 
+ */ +static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); + +/* + * Time to wait for running_helpers to become zero before the setting of + * usermodehelper_disabled in usermodehelper_pm_callback() fails + */ +#define RUNNING_HELPERS_TIMEOUT (5 * HZ) + static int usermodehelper_pm_callback(struct notifier_block *nfb, unsigned long action, void *ignored) { + long retval; + switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: usermodehelper_disabled = 1; - return NOTIFY_OK; + smp_mb(); + /* + * From now on call_usermodehelper_exec() won't start any new + * helpers, so it is sufficient if running_helpers turns out to + * be zero at one point (it may be increased later, but that + * doesn't matter). + */ + retval = wait_event_timeout(running_helpers_waitq, + atomic_read(&running_helpers) == 0, + RUNNING_HELPERS_TIMEOUT); + if (retval) { + return NOTIFY_OK; + } else { + usermodehelper_disabled = 0; + return NOTIFY_BAD; + } case PM_POST_HIBERNATION: case PM_POST_SUSPEND: usermodehelper_disabled = 0; @@ -293,6 +325,30 @@ static int usermodehelper_pm_callback(struct notifier_block *nfb, return NOTIFY_DONE; } +static void helper_lock(void) +{ + atomic_inc(&running_helpers); + smp_mb__after_atomic_inc(); +} + +static void helper_unlock(void) +{ + if (atomic_dec_and_test(&running_helpers)) + wake_up(&running_helpers_waitq); +} + +static void register_pm_notifier_callback(void) +{ + pm_notifier(usermodehelper_pm_callback, 0); +} +#else /* CONFIG_PM */ +#define usermodehelper_disabled 0 + +static inline void helper_lock(void) {} +static inline void helper_unlock(void) {} +static inline void register_pm_notifier_callback(void) {} +#endif /* CONFIG_PM */ + /** * call_usermodehelper_setup - prepare to call a usermode helper * @path - path to usermode executable @@ -397,6 +453,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, DECLARE_COMPLETION_ONSTACK(done); int retval; + helper_lock(); if (sub_info->path[0] == '\0') { retval = 0; goto out; @@ -418,6 +475,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, out: call_usermodehelper_freeinfo(sub_info); + helper_unlock(); return retval; } EXPORT_SYMBOL(call_usermodehelper_exec); @@ -459,5 +517,5 @@ void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); BUG_ON(!khelper_wq); - pm_notifier(usermodehelper_pm_callback, 0); + register_pm_notifier_callback(); } -- cgit v1.1 From 6c961dfb7c903cfd1cd71b506863894038fd704f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:38 -0700 Subject: PM: Reduce code duplication between main.c and user.c The SNAPSHOT_S2RAM ioctl code is outdated and it should not duplicate the suspend code in kernel/power/main.c. Fix that. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 99 +++++++++++++++++++++++++++++----------------------- kernel/power/power.h | 3 +- kernel/power/user.c | 38 +++----------------- 3 files changed, 62 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 4d26ad3..32147b5 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -65,14 +65,11 @@ static inline void pm_finish(suspend_state_t state) /** * suspend_prepare - Do prep work before entering low-power state. - * @state: State we're entering. * - * This is common code that is called for each state that we're - * entering. 
Allocate a console, stop all processes, then make sure - * the platform can enter the requested state. + * This is common code that is called for each state that we're entering. + * Run suspend notifiers, allocate a console and stop all processes. */ - -static int suspend_prepare(suspend_state_t state) +static int suspend_prepare(void) { int error; unsigned int free_pages; @@ -91,43 +88,18 @@ static int suspend_prepare(suspend_state_t state) goto Thaw; } - if ((free_pages = global_page_state(NR_FREE_PAGES)) - < FREE_PAGE_NUMBER) { + free_pages = global_page_state(NR_FREE_PAGES); + if (free_pages < FREE_PAGE_NUMBER) { pr_debug("PM: free some memory\n"); shrink_all_memory(FREE_PAGE_NUMBER - free_pages); if (nr_free_pages() < FREE_PAGE_NUMBER) { error = -ENOMEM; printk(KERN_ERR "PM: No enough memory\n"); - goto Thaw; } } - - if (pm_ops->set_target) { - error = pm_ops->set_target(state); - if (error) - goto Thaw; - } - suspend_console(); - error = device_suspend(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "Some devices failed to suspend\n"); - goto Resume_console; - } - if (pm_ops->prepare) { - if ((error = pm_ops->prepare(state))) - goto Resume_devices; - } - - error = disable_nonboot_cpus(); if (!error) return 0; - enable_nonboot_cpus(); - pm_finish(state); - Resume_devices: - device_resume(); - Resume_console: - resume_console(); Thaw: thaw_processes(); pm_restore_console(); @@ -148,6 +120,12 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) local_irq_enable(); } +/** + * suspend_enter - enter the desired system sleep state. + * @state: state to enter + * + * This function should be called after devices have been suspended. + */ int suspend_enter(suspend_state_t state) { int error = 0; @@ -167,21 +145,55 @@ int suspend_enter(suspend_state_t state) return error; } +/** + * suspend_devices_and_enter - suspend devices and enter the desired system sleep + * state. + * @state: state to enter + */ +int suspend_devices_and_enter(suspend_state_t state) +{ + int error; + + if (!pm_ops) + return -ENOSYS; + + if (pm_ops->set_target) { + error = pm_ops->set_target(state); + if (error) + return error; + } + suspend_console(); + error = device_suspend(PMSG_SUSPEND); + if (error) { + printk(KERN_ERR "Some devices failed to suspend\n"); + goto Resume_console; + } + if (pm_ops->prepare) { + error = pm_ops->prepare(state); + if (error) + goto Resume_devices; + } + error = disable_nonboot_cpus(); + if (!error) + suspend_enter(state); + + enable_nonboot_cpus(); + pm_finish(state); + Resume_devices: + device_resume(); + Resume_console: + resume_console(); + return error; +} /** * suspend_finish - Do final work before exiting suspend sequence. - * @state: State we're coming out of. * * Call platform code to clean up, restart processes, and free the * console that we've allocated. This is not called for suspend-to-disk. */ - -static void suspend_finish(suspend_state_t state) +static void suspend_finish(void) { - enable_nonboot_cpus(); - pm_finish(state); - device_resume(); - resume_console(); thaw_processes(); pm_restore_console(); pm_notifier_call_chain(PM_POST_SUSPEND); @@ -216,7 +228,6 @@ static inline int valid_state(suspend_state_t state) * Then, do the setup for suspend, enter the state, and cleaup (after * we've woken up). 
*/ - static int enter_state(suspend_state_t state) { int error; @@ -227,14 +238,14 @@ static int enter_state(suspend_state_t state) return -EBUSY; pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); - if ((error = suspend_prepare(state))) + if ((error = suspend_prepare())) goto Unlock; pr_debug("PM: Entering %s sleep\n", pm_states[state]); - error = suspend_enter(state); + error = suspend_devices_and_enter(state); pr_debug("PM: Finishing wakeup.\n"); - suspend_finish(state); + suspend_finish(); Unlock: mutex_unlock(&pm_mutex); return error; diff --git a/kernel/power/power.h b/kernel/power/power.h index 01c2275..5f24c78 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -170,7 +170,6 @@ extern int swsusp_resume(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); extern void swsusp_close(void); -extern int suspend_enter(suspend_state_t state); struct timeval; /* kernel/power/swsusp.c */ @@ -178,6 +177,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); /* kernel/power/main.c */ +extern int suspend_enter(suspend_state_t state); +extern int suspend_devices_and_enter(suspend_state_t state); extern struct blocking_notifier_head pm_chain_head; static inline int pm_notifier_call_chain(unsigned long val) diff --git a/kernel/power/user.c b/kernel/power/user.c index 7f19afe..bd0723a 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -255,47 +255,19 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; case SNAPSHOT_S2RAM: - if (!pm_ops) { - error = -ENOSYS; - break; - } - if (!data->frozen) { error = -EPERM; break; } - if (!mutex_trylock(&pm_mutex)) { error = -EBUSY; break; } - - if (pm_ops->prepare) { - error = pm_ops->prepare(PM_SUSPEND_MEM); - if (error) - goto OutS3; - } - - /* Put devices to sleep */ - suspend_console(); - error = device_suspend(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "Failed to suspend some devices.\n"); - } else { - error = disable_nonboot_cpus(); - if (!error) { - /* Enter S3, system is already frozen */ - suspend_enter(PM_SUSPEND_MEM); - enable_nonboot_cpus(); - } - /* Wake up devices */ - device_resume(); - } - resume_console(); - if (pm_ops->finish) - pm_ops->finish(PM_SUSPEND_MEM); - - OutS3: + /* + * Tasks are frozen and the notifiers have been called with + * PM_HIBERNATION_PREPARE + */ + error = suspend_devices_and_enter(PM_SUSPEND_MEM); mutex_unlock(&pm_mutex); break; -- cgit v1.1 From bd804eba1c8597cbb7cd5a5f9fe886aae16a079a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2007 01:47:40 -0700 Subject: PM: Introduce pm_power_off_prepare Introduce the pm_power_off_prepare() callback that can be registered by the interested platforms in analogy with pm_idle() and pm_power_off(), used for preparing the system to power off (needed by ACPI). This allows us to drop acpi_sysclass and device_acpi that are only defined in order to register the ACPI power off preparation callback, which is needed by pm_power_off() registered in a much different way. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 18987c7..d40e40a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -100,6 +100,13 @@ struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); /* + * If set, this is used for preparing the system to power off. 
+ */ + +void (*pm_power_off_prepare)(void); +EXPORT_SYMBOL(pm_power_off_prepare); + +/* * Notifier list for kernel code which wants to be called * at shutdown. This is used to stop any idling DMA operations * and the like. @@ -867,6 +874,8 @@ EXPORT_SYMBOL_GPL(kernel_halt); void kernel_power_off(void) { kernel_shutdown_prepare(SYSTEM_POWER_OFF); + if (pm_power_off_prepare) + pm_power_off_prepare(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); } -- cgit v1.1 From 5a60d6235c8352ade8f2699e72fcdfe853730456 Mon Sep 17 00:00:00 2001 From: Nigel Cunningham Date: Thu, 19 Jul 2007 01:47:41 -0700 Subject: PM: Optional beeping during resume from suspend to RAM Add a feature allowing the user to make the system beep during a resume from suspend to RAM, on x86_64 and i386. This is useful for the users with broken resume from RAM, so that they can verify if the control reaches the kernel after a wake-up event. Signed-off-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 32147b5..c74a564 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -332,6 +332,27 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n) power_attr(state); +unsigned long s2ram_beep = 0; + +static ssize_t s2ram_beep_show(struct kset *kset, char *buf) +{ + return sprintf(buf, "%d\n", s2ram_beep); +} + +static ssize_t +s2ram_beep_store(struct kset *kset, const char *buf, size_t n) +{ + int val; + + if (sscanf(buf, "%d", &val) > 0) { + s2ram_beep = val; + return n; + } + return -EINVAL; +} + +power_attr(s2ram_beep); + #ifdef CONFIG_PM_TRACE int pm_trace_enabled; @@ -357,11 +378,13 @@ power_attr(pm_trace); static struct attribute * g[] = { &state_attr.attr, &pm_trace_attr.attr, + &s2ram_beep_attr.attr, NULL, }; #else static struct attribute * g[] = { &state_attr.attr, + &s2ram_beep_attr.attr, NULL, }; #endif /* CONFIG_PM_TRACE */ -- cgit v1.1 From 77afcf78a2ded9a91838734234949c0ead5feb12 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 19 Jul 2007 01:47:41 -0700 Subject: PM: Integrate beeping flag with existing acpi_sleep flags Move "debug during resume from s2ram" into the variable we already use for real-mode flags to simplify code. It also closes nasty trap for the user in acpi_sleep_setup; order of parameters actually mattered there, acpi_sleep=s3_bios,s3_mode doing something different from acpi_sleep=s3_mode,s3_bios. Signed-off-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 23 ----------------------- kernel/sysctl.c | 2 +- 2 files changed, 1 insertion(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index c74a564..32147b5 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -332,27 +332,6 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n) power_attr(state); -unsigned long s2ram_beep = 0; - -static ssize_t s2ram_beep_show(struct kset *kset, char *buf) -{ - return sprintf(buf, "%d\n", s2ram_beep); -} - -static ssize_t -s2ram_beep_store(struct kset *kset, const char *buf, size_t n) -{ - int val; - - if (sscanf(buf, "%d", &val) > 0) { - s2ram_beep = val; - return n; - } - return -EINVAL; -} - -power_attr(s2ram_beep); - #ifdef CONFIG_PM_TRACE int pm_trace_enabled; @@ -378,13 +357,11 @@ power_attr(pm_trace); static struct attribute * g[] = { &state_attr.attr, &pm_trace_attr.attr, - &s2ram_beep_attr.attr, NULL, }; #else static struct attribute * g[] = { &state_attr.attr, - &s2ram_beep_attr.attr, NULL, }; #endif /* CONFIG_PM_TRACE */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 44a1d69..3ed4912b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -660,7 +660,7 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_ACPI_VIDEO_FLAGS, .procname = "acpi_video_flags", - .data = &acpi_video_flags, + .data = &acpi_realmode_flags, .maxlen = sizeof (unsigned long), .mode = 0644, .proc_handler = &proc_doulongvec_minmax, -- cgit v1.1 From 3d7e33825d8799115dd2495c9944badd3272a623 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 19 Jul 2007 01:48:11 -0700 Subject: jprobes: make jprobes a little safer for users I realise jprobes are a razor-blades-included type of interface, but that doesn't mean we can't try and make them safer to use. This guy I know once wrote code like this: struct jprobe jp = { .kp.symbol_name = "foo", .entry = "jprobe_foo" }; And then his kernel exploded. Oops. This patch adds an arch hook, arch_deref_entry_point() (I don't like it either) which takes the void * in a struct jprobe, and gives back the text address that it represents. We can then use that in register_jprobe() to check that the entry point we're passed is actually in the kernel text, rather than just some random value. Signed-off-by: Michael Ellerman Cc: Prasanna S Panchamukhi Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. 
Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9e47d8c..3e9f513 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -675,9 +675,18 @@ static struct notifier_block kprobe_exceptions_nb = { .priority = 0x7fffffff /* we need to be notified first */ }; +unsigned long __weak arch_deref_entry_point(void *entry) +{ + return (unsigned long)entry; +} int __kprobes register_jprobe(struct jprobe *jp) { + unsigned long addr = arch_deref_entry_point(jp->entry); + + if (!kernel_text_address(addr)) + return -EINVAL; + /* Todo: Verify probepoint is a function entry point */ jp->kp.pre_handler = setjmp_pre_handler; jp->kp.break_handler = longjmp_break_handler; -- cgit v1.1 From f34e3b61f2be9628bd41244f3ecc42009c5eced5 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 19 Jul 2007 01:48:13 -0700 Subject: use the new percpu interface for shared data Currently most of the per cpu data, which is accessed by different cpus, has a ____cacheline_aligned_in_smp attribute. Move all this data to the new per cpu shared data section: .data.percpu.shared_aligned. This will separate the percpu data which is referenced frequently by other cpus from the local-only percpu data. Signed-off-by: Fenghua Yu Acked-by: Suresh Siddha Cc: Rusty Russell Cc: Christoph Lameter Cc: "Luck, Tony" Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cb31fb4..645256b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -301,7 +301,7 @@ struct rq { struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static DEFINE_MUTEX(sched_hotcpu_mutex); static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) -- cgit v1.1 From bdf4c48af20a3b0f01671799ace345e3d49576da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:15 -0700 Subject: audit: rework execve audit The purpose of audit_bprm() is to log the argv array to a userspace daemon at the end of the execve system call. Since user-space hasn't had time to run, this array is still in a pristine state on the process' stack; so there is no need to copy it, we can just grab it from there. In order to minimize the damage to audit_log_*(), copy each string into a temporary kernel buffer first. Currently the audit code requires that the full argument vector fit in a single packet, so for now it clips the argv size to a (sysctl) limit, but only when execve auditing is enabled. If the audit protocol gets extended to allow for multiple packets, this check can be removed. 
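The per-string walk described above works because the argv strings sit back to back in the new mm, each NUL-terminated, so the logger can step from one string to the next by its length. A small user-space sketch of that iteration pattern follows; the packed buffer and names are illustrative only, while the kernel itself steps with strnlen_user() and copies each string with copy_from_user() into a bounded temporary buffer before logging.

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Packed, NUL-terminated strings, laid out like argv on the stack. */
	const char packed[] = "ls\0-l\0/tmp\0";
	const char *p = packed;
	int argc = 3;
	int i;

	for (i = 0; i < argc; i++, p += strlen(p) + 1) {
		/* One audit record field per argument, as in "a0=", "a1=", ... */
		printf("a%d=%s\n", i, p);
	}
	return 0;
}
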
Signed-off-by: Peter Zijlstra Signed-off-by: Ollie Wild Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 84 ++++++++++++++++++++++++++++++++++++++++++-------------- kernel/sysctl.c | 11 ++++++++ 2 files changed, 74 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b7640a5..535586f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -153,7 +153,7 @@ struct audit_aux_data_execve { struct audit_aux_data d; int argc; int envc; - char mem[0]; + struct mm_struct *mm; }; struct audit_aux_data_socketcall { @@ -831,6 +831,55 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, return rc; } +static void audit_log_execve_info(struct audit_buffer *ab, + struct audit_aux_data_execve *axi) +{ + int i; + long len, ret; + const char __user *p = (const char __user *)axi->mm->arg_start; + char *buf; + + if (axi->mm != current->mm) + return; /* execve failed, no additional info */ + + for (i = 0; i < axi->argc; i++, p += len) { + len = strnlen_user(p, MAX_ARG_PAGES*PAGE_SIZE); + /* + * We just created this mm, if we can't find the strings + * we just copied into it something is _very_ wrong. Similar + * for strings that are too long, we should not have created + * any. + */ + if (!len || len > MAX_ARG_STRLEN) { + WARN_ON(1); + send_sig(SIGKILL, current, 0); + } + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) { + audit_panic("out of memory for argv string\n"); + break; + } + + ret = copy_from_user(buf, p, len); + /* + * There is no reason for this copy to be short. We just + * copied them here, and the mm hasn't been exposed to user- + * space yet. + */ + if (!ret) { + WARN_ON(1); + send_sig(SIGKILL, current, 0); + } + + audit_log_format(ab, "a%d=", i); + audit_log_untrustedstring(ab, buf); + audit_log_format(ab, "\n"); + + kfree(buf); + } +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -971,13 +1020,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; - int i; - const char *p; - for (i = 0, p = axi->mem; i < axi->argc; i++) { - audit_log_format(ab, "a%d=", i); - p = audit_log_untrustedstring(ab, p); - audit_log_format(ab, "\n"); - } + audit_log_execve_info(ab, axi); break; } case AUDIT_SOCKETCALL: { @@ -1821,32 +1864,31 @@ int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode return 0; } +int audit_argv_kb = 32; + int audit_bprm(struct linux_binprm *bprm) { struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - unsigned long p, next; - void *to; if (likely(!audit_enabled || !context || context->dummy)) return 0; - ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, - GFP_KERNEL); + /* + * Even though the stack code doesn't limit the arg+env size any more, + * the audit code requires that _all_ arguments be logged in a single + * netlink skb. 
Hence cap it :-( + */ + if (bprm->argv_len > (audit_argv_kb << 10)) + return -E2BIG; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; ax->argc = bprm->argc; ax->envc = bprm->envc; - for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { - struct page *page = bprm->page[p / PAGE_SIZE]; - void *kaddr = kmap(page); - next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1); - memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p); - to += next - p; - kunmap(page); - } - + ax->mm = bprm->mm; ax->d.type = AUDIT_EXECVE; ax->d.next = context->aux; context->aux = (void *)ax; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3ed4912b..8db4176 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -78,6 +78,7 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; +extern int audit_argv_kb; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -306,6 +307,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_AUDITSYSCALL + { + .ctl_name = CTL_UNNUMBERED, + .procname = "audit_argv_kb", + .data = &audit_argv_kb, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", -- cgit v1.1 From b6a2fea39318e43fee84fa7b0b90d68bed92d2ba Mon Sep 17 00:00:00 2001 From: Ollie Wild Date: Thu, 19 Jul 2007 01:48:16 -0700 Subject: mm: variable length argument support Remove the arg+env limit of MAX_ARG_PAGES by copying the strings directly from the old mm into the new mm. We create the new mm before the binfmt code runs, and place the new stack at the very top of the address space. Once the binfmt code runs and figures out where the stack should be, we move it downwards. It is a bit peculiar in that we have one task with two mm's, one of which is inactive. [a.p.zijlstra@chello.nl: limit stack size] Signed-off-by: Ollie Wild Signed-off-by: Peter Zijlstra Cc: Cc: Hugh Dickins [bunk@stusta.de: unexport bprm_mm_init] Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 535586f..145cbb7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -843,7 +843,7 @@ static void audit_log_execve_info(struct audit_buffer *ab, return; /* execve failed, no additional info */ for (i = 0; i < axi->argc; i++, p += len) { - len = strnlen_user(p, MAX_ARG_PAGES*PAGE_SIZE); + len = strnlen_user(p, MAX_ARG_STRLEN); /* * We just created this mm, if we can't find the strings * we just copied into it something is _very_ wrong. Similar -- cgit v1.1 From 76fdbb25f963de5dc1e308325f0578a2f92b1c2d Mon Sep 17 00:00:00 2001 From: "Kawai, Hidehiro" Date: Thu, 19 Jul 2007 01:48:26 -0700 Subject: coredump masking: bound suid_dumpable sysctl This patch series is version 5 of the core dump masking feature, which controls which VMAs should be dumped based on their memory types and per-process flags. I adopted most of Andrew's suggestion at the previous version. He also suggested using system call instead of /proc// interface, I decided to use the latter continuously because adding new system call with pid argument will give a big impact on the kernel. You can access the per-process flags via /proc//coredump_filter interface. 
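A minimal user-space sketch of driving that file for the current process (/proc/self/coredump_filter); error handling is trimmed, the hex read/write format follows the proc interface added later in this series, and the bit being cleared is the anonymous-shared bit listed just below.

#include <stdio.h>

int main(void)
{
	unsigned int mask;
	FILE *f = fopen("/proc/self/coredump_filter", "r+");

	if (!f)
		return 1;
	if (fscanf(f, "%x", &mask) != 1) {
		fclose(f);
		return 1;
	}
	printf("current filter: %#x\n", mask);

	/* Drop anonymous shared mappings (bit 1) from future core dumps. */
	rewind(f);
	fprintf(f, "%#x", mask & ~0x2u);
	fclose(f);
	return 0;
}
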
coredump_filter represents a bitmask of memory types, and if a bit is set, VMAs of the corresponding memory type are written into the core file when the process is dumped. The bitmask is inherited from the parent process when a process is created. The original purpose is to avoid a prolonged system-wide slowdown when a number of processes which share a huge shared memory segment are dumped at the same time. To achieve this purpose, this patch series adds the ability to suppress dumping of anonymous shared memory for specified processes. In this version, three other memory types are also supported. Here are the coredump_filter bits:
bit 0: anonymous private memory
bit 1: anonymous shared memory
bit 2: file-backed private memory
bit 3: file-backed shared memory
The default value of coredump_filter is 0x3, so the new core dump routine behaves the same as the conventional one by default. In this version, the coredump_filter bits and mm.dumpable are merged into mm.flags, which is accessed with atomic bitops. The supported core file formats are ELF and ELF-FDPIC. ELF has been tested, but ELF-FDPIC has not been built and tested because I don't have the test environment. This patch limits the value of the suid_dumpable sysctl to the range 0 to 2. Signed-off-by: Hidehiro Kawai Cc: Alan Cox Cc: David Howells Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8db4176..2aaa3f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -733,6 +733,7 @@ static ctl_table kern_table[] = { /* Constants for minimum and maximum testing in vm_table. We use these as one-element integer vectors. */ static int zero; +static int two = 2; static int one_hundred = 100; @@ -1123,7 +1124,10 @@ static ctl_table fs_table[] = { .data = &lease_break_time, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &two, }, { .ctl_name = FS_AIO_NR, -- cgit v1.1 From 6c5d523826dc639df709ed0f88c5d2ce25379652 Mon Sep 17 00:00:00 2001 From: "Kawai, Hidehiro" Date: Thu, 19 Jul 2007 01:48:27 -0700 Subject: coredump masking: reimplementation of dumpable using two flags This patch changes mm_struct.dumpable to a pair of bit flags. set_dumpable() converts the three-valued dumpable into two flags and stores them in the lower two bits of mm_struct.flags instead of mm_struct.dumpable. get_dumpable() performs the reverse conversion. 
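To see why two bits are enough for the three dumpable states, here is a stand-alone model of the conversion; it only assumes the low-two-bits placement described above (the exact bit encoding shown is illustrative), and the real helpers operate on mm->flags with atomic bitops.

#include <assert.h>
#include <stdio.h>

#define DUMPABLE_BITS 0x3UL

/* Illustrative model: pack the three-valued setting into two bits. */
static void set_dumpable_model(unsigned long *flags, int value)
{
	*flags = (*flags & ~DUMPABLE_BITS) | (value == 2 ? 0x3 : value);
}

static int get_dumpable_model(unsigned long flags)
{
	int bits = flags & DUMPABLE_BITS;

	return bits >= 2 ? 2 : bits;
}

int main(void)
{
	unsigned long flags = 0;
	int v;

	for (v = 0; v <= 2; v++) {
		set_dumpable_model(&flags, v);
		assert(get_dumpable_model(flags) == v);
		printf("dumpable=%d -> low bits %#lx\n", v, flags & DUMPABLE_BITS);
	}
	return 0;
}
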
[akpm@linux-foundation.org: export set_dumpable] Signed-off-by: Hidehiro Kawai Cc: Alan Cox Cc: David Howells Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 +- kernel/sys.c | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4a1745f1..82a558b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -142,7 +142,7 @@ static int may_attach(struct task_struct *task) return -EPERM; smp_rmb(); if (task->mm) - dumpable = task->mm->dumpable; + dumpable = get_dumpable(task->mm); if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; diff --git a/kernel/sys.c b/kernel/sys.c index d40e40a..08562f4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1036,7 +1036,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) return -EPERM; } if (new_egid != old_egid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } if (rgid != (gid_t) -1 || @@ -1066,13 +1066,13 @@ asmlinkage long sys_setgid(gid_t gid) if (capable(CAP_SETGID)) { if (old_egid != gid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->gid = current->egid = current->sgid = current->fsgid = gid; } else if ((gid == current->gid) || (gid == current->sgid)) { if (old_egid != gid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->egid = current->fsgid = gid; @@ -1103,7 +1103,7 @@ static int set_user(uid_t new_ruid, int dumpclear) switch_uid(new_user); if (dumpclear) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->uid = new_ruid; @@ -1159,7 +1159,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) return -EAGAIN; if (new_euid != old_euid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->fsuid = current->euid = new_euid; @@ -1209,7 +1209,7 @@ asmlinkage long sys_setuid(uid_t uid) return -EPERM; if (old_euid != uid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->fsuid = current->euid = uid; @@ -1254,7 +1254,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) } if (euid != (uid_t) -1) { if (euid != current->euid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->euid = euid; @@ -1304,7 +1304,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) } if (egid != (gid_t) -1) { if (egid != current->egid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->egid = egid; @@ -1350,7 +1350,7 @@ asmlinkage long sys_setfsuid(uid_t uid) uid == current->suid || uid == current->fsuid || capable(CAP_SETUID)) { if (uid != old_fsuid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->fsuid = uid; @@ -1379,7 +1379,7 @@ asmlinkage long sys_setfsgid(gid_t gid) gid == current->sgid || gid == current->fsgid || capable(CAP_SETGID)) { if (gid != old_fsgid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->fsgid = gid; @@ -2176,14 +2176,14 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = put_user(current->pdeath_signal, (int __user *)arg2); break; case 
PR_GET_DUMPABLE: - error = current->mm->dumpable; + error = get_dumpable(current->mm); break; case PR_SET_DUMPABLE: if (arg2 < 0 || arg2 > 1) { error = -EINVAL; break; } - current->mm->dumpable = arg2; + set_dumpable(current->mm, arg2); break; case PR_SET_UNALIGN: -- cgit v1.1 From 3cb4a0bb1e773e3c41800b33a3f7dab32bd06c64 Mon Sep 17 00:00:00 2001 From: "Kawai, Hidehiro" Date: Thu, 19 Jul 2007 01:48:28 -0700 Subject: coredump masking: add an interface for core dump filter This patch adds an interface to set/reset flags which determines each memory segment should be dumped or not when a core file is generated. /proc//coredump_filter file is provided to access the flags. You can change the flag status for a particular process by writing to or reading from the file. The flag status is inherited to the child process when it is created. Signed-off-by: Hidehiro Kawai Cc: Alan Cox Cc: David Howells Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ba39bdb..4698389 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -334,6 +334,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); + mm->flags = (current->mm) ? current->mm->flags + : MMF_DUMP_FILTER_DEFAULT; mm->core_waiters = 0; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); -- cgit v1.1 From 01c55ed3260e130f152b7fbab2e18f23980b59a4 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 19 Jul 2007 01:48:32 -0700 Subject: kernel/relay.c: make functions static Signed-off-by: Adrian Bunk Cc: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index a615a8f..510fbbd 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -80,7 +80,7 @@ static struct vm_operations_struct relay_file_mmap_ops = { * * Caller should already have grabbed mmap_sem. */ -int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) +static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) { unsigned long length = vma->vm_end - vma->vm_start; struct file *filp = vma->vm_file; @@ -145,7 +145,7 @@ depopulate: * * Returns channel buffer if successful, %NULL otherwise. */ -struct rchan_buf *relay_create_buf(struct rchan *chan) +static struct rchan_buf *relay_create_buf(struct rchan *chan) { struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); if (!buf) @@ -175,7 +175,7 @@ free_buf: * * Should only be called from kref_put(). */ -void relay_destroy_channel(struct kref *kref) +static void relay_destroy_channel(struct kref *kref) { struct rchan *chan = container_of(kref, struct rchan, kref); kfree(chan); @@ -185,7 +185,7 @@ void relay_destroy_channel(struct kref *kref) * relay_destroy_buf - destroy an rchan_buf struct and associated buffer * @buf: the buffer struct */ -void relay_destroy_buf(struct rchan_buf *buf) +static void relay_destroy_buf(struct rchan_buf *buf) { struct rchan *chan = buf->chan; unsigned int i; @@ -210,7 +210,7 @@ void relay_destroy_buf(struct rchan_buf *buf) * rchan_buf_struct and the channel buffer. Should only be called from * kref_put(). 
*/ -void relay_remove_buf(struct kref *kref) +static void relay_remove_buf(struct kref *kref) { struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); buf->chan->cb->remove_buf_file(buf->dentry); @@ -223,11 +223,10 @@ void relay_remove_buf(struct kref *kref) * * Returns 1 if the buffer is empty, 0 otherwise. */ -int relay_buf_empty(struct rchan_buf *buf) +static int relay_buf_empty(struct rchan_buf *buf) { return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; } -EXPORT_SYMBOL_GPL(relay_buf_empty); /** * relay_buf_full - boolean, is the channel buffer full? -- cgit v1.1 From da1a679cde9b12d6e331f43d2d92a234f2d1f9b0 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 19 Jul 2007 01:48:39 -0700 Subject: Add /sys/kernel/notes This patch adds the /sys/kernel/notes magic file. Reading this delivers the contents of the kernel's .notes section. This lets userland easily glean any detailed information about the running kernel's build that was stored there at compile time. Signed-off-by: Roland McGrath Cc: Andi Kleen Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 559deca..2565e1b 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -62,6 +62,28 @@ static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) KERNEL_ATTR_RO(kexec_crash_loaded); #endif /* CONFIG_KEXEC */ +/* + * Make /sys/kernel/notes give the raw contents of our kernel .notes section. + */ +extern const char __start_notes __attribute__((weak)); +extern const char __stop_notes __attribute__((weak)); +#define notes_size (&__stop_notes - &__start_notes) + +static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + memcpy(buf, &__start_notes + off, count); + return count; +} + +static struct bin_attribute notes_attr = { + .attr = { + .name = "notes", + .mode = S_IRUGO, + }, + .read = ¬es_read, +}; + decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); @@ -88,6 +110,12 @@ static int __init ksysfs_init(void) error = sysfs_create_group(&kernel_subsys.kobj, &kernel_attr_group); + if (!error && notes_size > 0) { + notes_attr.size = notes_size; + error = sysfs_create_bin_file(&kernel_subsys.kobj, + ¬es_attr); + } + return error; } -- cgit v1.1 From ca58abcb4a6d52ee2db1b1130cea3ca2a76677b9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:53 -0700 Subject: lockdep: sanitise CONFIG_PROVE_LOCKING Ensure that all of the lock dependency tracking code is under CONFIG_PROVE_LOCKING. This allows us to use the held lock tracking code for other purposes. Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 13 ++++++++++++- kernel/spinlock.c | 4 ++-- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index edba2ff..05c1261 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -95,6 +95,7 @@ static int lockdep_initialized; unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +#ifdef CONFIG_PROVE_LOCKING /* * Allocate a lockdep entry. 
(assumes the graph_lock held, returns * with NULL on failure) @@ -111,6 +112,7 @@ static struct lock_list *alloc_list_entry(void) } return list_entries + nr_list_entries++; } +#endif /* * All data structures here are protected by the global debug_lock. @@ -140,7 +142,9 @@ LIST_HEAD(all_lock_classes); static struct list_head classhash_table[CLASSHASH_SIZE]; unsigned long nr_lock_chains; +#ifdef CONFIG_PROVE_LOCKING static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +#endif /* * We put the lock dependency chains into a hash-table as well, to cache @@ -482,6 +486,7 @@ static void print_lock_dependencies(struct lock_class *class, int depth) } } +#ifdef CONFIG_PROVE_LOCKING /* * Add a new dependency to the head of the list: */ @@ -541,6 +546,7 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) return 0; } +#endif static void print_kernel_version(void) { @@ -549,6 +555,7 @@ static void print_kernel_version(void) init_utsname()->version); } +#ifdef CONFIG_PROVE_LOCKING /* * When a circular dependency is detected, print the * header first: @@ -639,6 +646,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) } return 1; } +#endif static int very_verbose(struct lock_class *class) { @@ -823,6 +831,7 @@ check_usage(struct task_struct *curr, struct held_lock *prev, #endif +#ifdef CONFIG_PROVE_LOCKING static int print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) @@ -1087,7 +1096,7 @@ out_bug: return 0; } - +#endif /* * Is this the address of a static object: @@ -1307,6 +1316,7 @@ out_unlock_set: return class; } +#ifdef CONFIG_PROVE_LOCKING /* * Look up a dependency chain. If the key is not present yet then * add it and return 1 - in this case the new dependency chain is @@ -1381,6 +1391,7 @@ cache_hit: return 1; } +#endif /* * We are building curr_chain_key incrementally, so double-check diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 2c6c2bf..cd93bfe 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -88,7 +88,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) * _raw_spin_lock_flags() code, because lockdep assumes * that interrupts are not re-enabled during lock-acquire: */ -#ifdef CONFIG_PROVE_LOCKING +#ifdef CONFIG_LOCKDEP _raw_spin_lock(lock); #else _raw_spin_lock_flags(lock, &flags); @@ -305,7 +305,7 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas * _raw_spin_lock_flags() code, because lockdep assumes * that interrupts are not re-enabled during lock-acquire: */ -#ifdef CONFIG_PROVE_SPIN_LOCKING +#ifdef CONFIG_LOCKDEP _raw_spin_lock(lock); #else _raw_spin_lock_flags(lock, &flags); -- cgit v1.1 From 8e18257d29238311e82085152741f0c3aa18b74d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:54 -0700 Subject: lockdep: reduce the ifdeffery Move code around to get fewer but larger #ifdef sections. Break some in-function #ifdefs out into their own functions. 
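The refactoring relies on a standard trick: each conditional feature gets a real implementation in one branch of the #ifdef and an empty static inline stub in the other, so the call sites themselves stay unconditional. A generic sketch of that shape; the names are simplified stand-ins for the helpers this patch introduces, not their actual signatures.

#include <stdio.h>

/* Compile with -DPROVE_LOCKING to get the real checks. */
#ifdef PROVE_LOCKING
static int check_prev_add_irq(int prev, int next)
{
	/* real dependency checks would live here */
	printf("checking irq-safety of %d -> %d\n", prev, next);
	return 1;
}

static void inc_chains(void)
{
	printf("accounting a new lock chain\n");
}
#else
/* Empty stubs keep the callers free of #ifdefs when the feature is off. */
static inline int check_prev_add_irq(int prev, int next)
{
	(void)prev;
	(void)next;
	return 1;
}

static inline void inc_chains(void)
{
}
#endif

int main(void)
{
	if (check_prev_add_irq(1, 2))
		inc_chains();
	return 0;
}
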
Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 1657 ++++++++++++++++++++++++++----------------------- kernel/lockdep_proc.c | 2 + 2 files changed, 873 insertions(+), 786 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 05c1261..87ac364 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -95,25 +95,6 @@ static int lockdep_initialized; unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; -#ifdef CONFIG_PROVE_LOCKING -/* - * Allocate a lockdep entry. (assumes the graph_lock held, returns - * with NULL on failure) - */ -static struct lock_list *alloc_list_entry(void) -{ - if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { - if (!debug_locks_off_graph_unlock()) - return NULL; - - printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); - return NULL; - } - return list_entries + nr_list_entries++; -} -#endif - /* * All data structures here are protected by the global debug_lock. * @@ -141,11 +122,6 @@ LIST_HEAD(all_lock_classes); static struct list_head classhash_table[CLASSHASH_SIZE]; -unsigned long nr_lock_chains; -#ifdef CONFIG_PROVE_LOCKING -static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; -#endif - /* * We put the lock dependency chains into a hash-table as well, to cache * their existence: @@ -227,26 +203,6 @@ static int verbose(struct lock_class *class) return 0; } -#ifdef CONFIG_TRACE_IRQFLAGS - -static int hardirq_verbose(struct lock_class *class) -{ -#if HARDIRQ_VERBOSE - return class_filter(class); -#endif - return 0; -} - -static int softirq_verbose(struct lock_class *class) -{ -#if SOFTIRQ_VERBOSE - return class_filter(class); -#endif - return 0; -} - -#endif - /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the graph_lock. @@ -486,151 +442,392 @@ static void print_lock_dependencies(struct lock_class *class, int depth) } } -#ifdef CONFIG_PROVE_LOCKING +static void print_kernel_version(void) +{ + printk("%s %.*s\n", init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); +} + +static int very_verbose(struct lock_class *class) +{ +#if VERY_VERBOSE + return class_filter(class); +#endif + return 0; +} + /* - * Add a new dependency to the head of the list: + * Is this the address of a static object: */ -static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip, int distance) +static int static_obj(void *obj) { - struct lock_list *entry; + unsigned long start = (unsigned long) &_stext, + end = (unsigned long) &_end, + addr = (unsigned long) obj; +#ifdef CONFIG_SMP + int i; +#endif + /* - * Lock not present yet - get a new dependency struct and - * add it to the list: + * static variable? */ - entry = alloc_list_entry(); - if (!entry) - return 0; - - entry->class = this; - entry->distance = distance; - if (!save_trace(&entry->trace)) - return 0; + if ((addr >= start) && (addr < end)) + return 1; +#ifdef CONFIG_SMP /* - * Since we never remove from the dependency list, the list can - * be walked lockless by other CPUs, it's only allocation - * that must be protected by the spinlock. But this also means - * we must make new entries visible only once writes to the - * entry become visible - hence the RCU op: + * percpu var? 
*/ - list_add_tail_rcu(&entry->entry, head); - - return 1; -} - -/* - * Recursive, forwards-direction lock-dependency checking, used for - * both noncyclic checking and for hardirq-unsafe/softirq-unsafe - * checking. - * - * (to keep the stackframe of the recursive functions small we - * use these global variables, and we also mark various helper - * functions as noinline.) - */ -static struct held_lock *check_source, *check_target; - -/* - * Print a dependency chain entry (this is only done when a deadlock - * has been detected): - */ -static noinline int -print_circular_bug_entry(struct lock_list *target, unsigned int depth) -{ - if (debug_locks_silent) - return 0; - printk("\n-> #%u", depth); - print_lock_name(target->class); - printk(":\n"); - print_stack_trace(&target->trace, 6); + for_each_possible_cpu(i) { + start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); + end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM + + per_cpu_offset(i); - return 0; -} + if ((addr >= start) && (addr < end)) + return 1; + } #endif -static void print_kernel_version(void) -{ - printk("%s %.*s\n", init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + /* + * module var? + */ + return is_module_address(addr); } -#ifdef CONFIG_PROVE_LOCKING /* - * When a circular dependency is detected, print the - * header first: + * To make lock name printouts unique, we calculate a unique + * class->name_version generation counter: */ -static noinline int -print_circular_bug_header(struct lock_list *entry, unsigned int depth) +static int count_matching_names(struct lock_class *new_class) { - struct task_struct *curr = current; + struct lock_class *class; + int count = 0; - if (!debug_locks_off_graph_unlock() || debug_locks_silent) + if (!new_class->name) return 0; - printk("\n=======================================================\n"); - printk( "[ INFO: possible circular locking dependency detected ]\n"); - print_kernel_version(); - printk( "-------------------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", - curr->comm, curr->pid); - print_lock(check_source); - printk("\nbut task is already holding lock:\n"); - print_lock(check_target); - printk("\nwhich lock already depends on the new lock.\n\n"); - printk("\nthe existing dependency chain (in reverse order) is:\n"); - - print_circular_bug_entry(entry, depth); + list_for_each_entry(class, &all_lock_classes, lock_entry) { + if (new_class->key - new_class->subclass == class->key) + return class->name_version; + if (class->name && !strcmp(class->name, new_class->name)) + count = max(count, class->name_version); + } - return 0; + return count + 1; } -static noinline int print_circular_bug_tail(void) +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. 
+ */ +static inline struct lock_class * +look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { - struct task_struct *curr = current; - struct lock_list this; - - if (debug_locks_silent) - return 0; - - this.class = check_source->class; - if (!save_trace(&this.trace)) - return 0; - - print_circular_bug_entry(&this, 0); + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; - printk("\nother info that might help us debug this:\n\n"); - lockdep_print_held_locks(curr); +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * If the architecture calls into lockdep before initializing + * the hashes then we'll warn about it later. (we cannot printk + * right now) + */ + if (unlikely(!lockdep_initialized)) { + lockdep_init(); + lockdep_init_error = 1; + } +#endif - printk("\nstack backtrace:\n"); - dump_stack(); + /* + * Static locks do not have their class-keys yet - for them the key + * is the lock object itself: + */ + if (unlikely(!lock->key)) + lock->key = (void *)lock; - return 0; -} + /* + * NOTE: the class-key must be unique. For dynamic locks, a static + * lock_class_key variable is passed in through the mutex_init() + * (or spin_lock_init()) call - which acts as the key. For static + * locks we use the lock object itself as the key. + */ + BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class)); -#define RECURSION_LIMIT 40 + key = lock->key->subkeys + subclass; -static int noinline print_infinite_recursion_bug(void) -{ - if (!debug_locks_off_graph_unlock()) - return 0; + hash_head = classhashentry(key); - WARN_ON(1); + /* + * We can walk the hash lockfree, because the hash only + * grows, and we are careful when adding entries to the end: + */ + list_for_each_entry(class, hash_head, hash_entry) + if (class->key == key) + return class; - return 0; + return NULL; } /* - * Prove that the dependency graph starting at can not - * lead to . Print an error and return 0 if it does. + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. */ -static noinline int -check_noncircular(struct lock_class *source, unsigned int depth) +static inline struct lock_class * +register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { - struct lock_list *entry; - - debug_atomic_inc(&nr_cyclic_check_recursions); - if (depth > max_recursion_depth) + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; + unsigned long flags; + + class = look_up_lock_class(lock, subclass); + if (likely(class)) + return class; + + /* + * Debug-check: all keys must be persistent! 
+ */ + if (!static_obj(lock->key)) { + debug_locks_off(); + printk("INFO: trying to register non-static key.\n"); + printk("the code is fine but needs lockdep annotation.\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); + + return NULL; + } + + key = lock->key->subkeys + subclass; + hash_head = classhashentry(key); + + raw_local_irq_save(flags); + if (!graph_lock()) { + raw_local_irq_restore(flags); + return NULL; + } + /* + * We have to do the hash-walk again, to avoid races + * with another CPU: + */ + list_for_each_entry(class, hash_head, hash_entry) + if (class->key == key) + goto out_unlock_set; + /* + * Allocate a new key from the static array, and add it to + * the hash: + */ + if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { + if (!debug_locks_off_graph_unlock()) { + raw_local_irq_restore(flags); + return NULL; + } + raw_local_irq_restore(flags); + + printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); + printk("turning off the locking correctness validator.\n"); + return NULL; + } + class = lock_classes + nr_lock_classes++; + debug_atomic_inc(&nr_unused_locks); + class->key = key; + class->name = lock->name; + class->subclass = subclass; + INIT_LIST_HEAD(&class->lock_entry); + INIT_LIST_HEAD(&class->locks_before); + INIT_LIST_HEAD(&class->locks_after); + class->name_version = count_matching_names(class); + /* + * We use RCU's safe list-add method to make + * parallel walking of the hash-list safe: + */ + list_add_tail_rcu(&class->hash_entry, hash_head); + + if (verbose(class)) { + graph_unlock(); + raw_local_irq_restore(flags); + + printk("\nnew class %p: %s", class->key, class->name); + if (class->name_version > 1) + printk("#%d", class->name_version); + printk("\n"); + dump_stack(); + + raw_local_irq_save(flags); + if (!graph_lock()) { + raw_local_irq_restore(flags); + return NULL; + } + } +out_unlock_set: + graph_unlock(); + raw_local_irq_restore(flags); + + if (!subclass || force) + lock->class_cache = class; + + if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) + return NULL; + + return class; +} + +#ifdef CONFIG_PROVE_LOCKING +/* + * Allocate a lockdep entry. (assumes the graph_lock held, returns + * with NULL on failure) + */ +static struct lock_list *alloc_list_entry(void) +{ + if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { + if (!debug_locks_off_graph_unlock()) + return NULL; + + printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); + printk("turning off the locking correctness validator.\n"); + return NULL; + } + return list_entries + nr_list_entries++; +} + +/* + * Add a new dependency to the head of the list: + */ +static int add_lock_to_list(struct lock_class *class, struct lock_class *this, + struct list_head *head, unsigned long ip, int distance) +{ + struct lock_list *entry; + /* + * Lock not present yet - get a new dependency struct and + * add it to the list: + */ + entry = alloc_list_entry(); + if (!entry) + return 0; + + entry->class = this; + entry->distance = distance; + if (!save_trace(&entry->trace)) + return 0; + + /* + * Since we never remove from the dependency list, the list can + * be walked lockless by other CPUs, it's only allocation + * that must be protected by the spinlock. But this also means + * we must make new entries visible only once writes to the + * entry become visible - hence the RCU op: + */ + list_add_tail_rcu(&entry->entry, head); + + return 1; +} + +/* + * Recursive, forwards-direction lock-dependency checking, used for + * both noncyclic checking and for hardirq-unsafe/softirq-unsafe + * checking. 
+ * + * (to keep the stackframe of the recursive functions small we + * use these global variables, and we also mark various helper + * functions as noinline.) + */ +static struct held_lock *check_source, *check_target; + +/* + * Print a dependency chain entry (this is only done when a deadlock + * has been detected): + */ +static noinline int +print_circular_bug_entry(struct lock_list *target, unsigned int depth) +{ + if (debug_locks_silent) + return 0; + printk("\n-> #%u", depth); + print_lock_name(target->class); + printk(":\n"); + print_stack_trace(&target->trace, 6); + + return 0; +} + +/* + * When a circular dependency is detected, print the + * header first: + */ +static noinline int +print_circular_bug_header(struct lock_list *entry, unsigned int depth) +{ + struct task_struct *curr = current; + + if (!debug_locks_off_graph_unlock() || debug_locks_silent) + return 0; + + printk("\n=======================================================\n"); + printk( "[ INFO: possible circular locking dependency detected ]\n"); + print_kernel_version(); + printk( "-------------------------------------------------------\n"); + printk("%s/%d is trying to acquire lock:\n", + curr->comm, curr->pid); + print_lock(check_source); + printk("\nbut task is already holding lock:\n"); + print_lock(check_target); + printk("\nwhich lock already depends on the new lock.\n\n"); + printk("\nthe existing dependency chain (in reverse order) is:\n"); + + print_circular_bug_entry(entry, depth); + + return 0; +} + +static noinline int print_circular_bug_tail(void) +{ + struct task_struct *curr = current; + struct lock_list this; + + if (debug_locks_silent) + return 0; + + this.class = check_source->class; + if (!save_trace(&this.trace)) + return 0; + + print_circular_bug_entry(&this, 0); + + printk("\nother info that might help us debug this:\n\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +#define RECURSION_LIMIT 40 + +static int noinline print_infinite_recursion_bug(void) +{ + if (!debug_locks_off_graph_unlock()) + return 0; + + WARN_ON(1); + + return 0; +} + +/* + * Prove that the dependency graph starting at can not + * lead to . Print an error and return 0 if it does. 
+ */ +static noinline int +check_noncircular(struct lock_class *source, unsigned int depth) +{ + struct lock_list *entry; + + debug_atomic_inc(&nr_cyclic_check_recursions); + if (depth > max_recursion_depth) max_recursion_depth = depth; if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); @@ -646,17 +843,8 @@ check_noncircular(struct lock_class *source, unsigned int depth) } return 1; } -#endif -static int very_verbose(struct lock_class *class) -{ -#if VERY_VERBOSE - return class_filter(class); -#endif - return 0; -} #ifdef CONFIG_TRACE_IRQFLAGS - /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency @@ -829,9 +1017,80 @@ check_usage(struct task_struct *curr, struct held_lock *prev, bit_backwards, bit_forwards, irqclass); } +static int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + /* + * Prove that the new dependency does not connect a hardirq-safe + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, + LOCK_ENABLED_HARDIRQS, "hard")) + return 0; + + /* + * Prove that the new dependency does not connect a hardirq-safe-read + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, + LOCK_ENABLED_HARDIRQS, "hard-read")) + return 0; + + /* + * Prove that the new dependency does not connect a softirq-safe + * lock with a softirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; + /* + * Prove that the new dependency does not connect a softirq-safe-read + * lock with a softirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; + + return 1; +} + +static void inc_chains(void) +{ + if (current->hardirq_context) + nr_hardirq_chains++; + else { + if (current->softirq_context) + nr_softirq_chains++; + else + nr_process_chains++; + } +} + +#else + +static inline int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + return 1; +} + +static inline void inc_chains(void) +{ + nr_process_chains++; +} + #endif -#ifdef CONFIG_PROVE_LOCKING static int print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) @@ -931,47 +1190,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (!(check_noncircular(next->class, 0))) return print_circular_bug_tail(); -#ifdef CONFIG_TRACE_IRQFLAGS - /* - * Prove that the new dependency does not connect a hardirq-safe - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, - LOCK_ENABLED_HARDIRQS, "hard")) - return 0; - - /* - * Prove that the new dependency does not connect a hardirq-safe-read - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting 
at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, - LOCK_ENABLED_HARDIRQS, "hard-read")) + if (!check_prev_add_irq(curr, prev, next)) return 0; /* - * Prove that the new dependency does not connect a softirq-safe - * lock with a softirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, - LOCK_ENABLED_SOFTIRQS, "soft")) - return 0; - /* - * Prove that the new dependency does not connect a softirq-safe-read - * lock with a softirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, - LOCK_ENABLED_SOFTIRQS, "soft")) - return 0; -#endif - /* * For recursive read-locks we do all the dependency checks, * but we dont store read-triggered dependencies (only * write-triggered dependencies). This ensures that only the @@ -1013,310 +1235,93 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return 0; /* - * Debugging printouts: - */ - if (verbose(prev->class) || verbose(next->class)) { - graph_unlock(); - printk("\n new dependency: "); - print_lock_name(prev->class); - printk(" => "); - print_lock_name(next->class); - printk("\n"); - dump_stack(); - return graph_lock(); - } - return 1; -} - -/* - * Add the dependency to all directly-previous locks that are 'relevant'. - * The ones that are relevant are (in increasing distance from curr): - * all consecutive trylock entries and the final non-trylock entry - or - * the end of this context's lock-chain - whichever comes first. - */ -static int -check_prevs_add(struct task_struct *curr, struct held_lock *next) -{ - int depth = curr->lockdep_depth; - struct held_lock *hlock; - - /* - * Debugging checks. - * - * Depth must not be zero for a non-head lock: - */ - if (!depth) - goto out_bug; - /* - * At least two relevant locks must exist for this - * to be a head: - */ - if (curr->held_locks[depth].irq_context != - curr->held_locks[depth-1].irq_context) - goto out_bug; - - for (;;) { - int distance = curr->lockdep_depth - depth + 1; - hlock = curr->held_locks + depth-1; - /* - * Only non-recursive-read entries get new dependencies - * added: - */ - if (hlock->read != 2) { - if (!check_prev_add(curr, hlock, next, distance)) - return 0; - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; - } - depth--; - /* - * End of lock-stack? - */ - if (!depth) - break; - /* - * Stop the search if we cross into another context: - */ - if (curr->held_locks[depth].irq_context != - curr->held_locks[depth-1].irq_context) - break; - } - return 1; -out_bug: - if (!debug_locks_off_graph_unlock()) - return 0; - - WARN_ON(1); - - return 0; -} -#endif - -/* - * Is this the address of a static object: - */ -static int static_obj(void *obj) -{ - unsigned long start = (unsigned long) &_stext, - end = (unsigned long) &_end, - addr = (unsigned long) obj; -#ifdef CONFIG_SMP - int i; -#endif - - /* - * static variable? - */ - if ((addr >= start) && (addr < end)) - return 1; - -#ifdef CONFIG_SMP - /* - * percpu var? 
- */ - for_each_possible_cpu(i) { - start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); - end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM - + per_cpu_offset(i); - - if ((addr >= start) && (addr < end)) - return 1; - } -#endif - - /* - * module var? - */ - return is_module_address(addr); -} - -/* - * To make lock name printouts unique, we calculate a unique - * class->name_version generation counter: - */ -static int count_matching_names(struct lock_class *new_class) -{ - struct lock_class *class; - int count = 0; - - if (!new_class->name) - return 0; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - if (new_class->key - new_class->subclass == class->key) - return class->name_version; - if (class->name && !strcmp(class->name, new_class->name)) - count = max(count, class->name_version); - } - - return count + 1; -} - -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ -static inline struct lock_class * -look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) -{ - struct lockdep_subclass_key *key; - struct list_head *hash_head; - struct lock_class *class; - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * If the architecture calls into lockdep before initializing - * the hashes then we'll warn about it later. (we cannot printk - * right now) - */ - if (unlikely(!lockdep_initialized)) { - lockdep_init(); - lockdep_init_error = 1; - } -#endif - - /* - * Static locks do not have their class-keys yet - for them the key - * is the lock object itself: - */ - if (unlikely(!lock->key)) - lock->key = (void *)lock; - - /* - * NOTE: the class-key must be unique. For dynamic locks, a static - * lock_class_key variable is passed in through the mutex_init() - * (or spin_lock_init()) call - which acts as the key. For static - * locks we use the lock object itself as the key. - */ - BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class)); - - key = lock->key->subkeys + subclass; - - hash_head = classhashentry(key); - - /* - * We can walk the hash lockfree, because the hash only - * grows, and we are careful when adding entries to the end: - */ - list_for_each_entry(class, hash_head, hash_entry) - if (class->key == key) - return class; - - return NULL; -} - -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ -static inline struct lock_class * -register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) -{ - struct lockdep_subclass_key *key; - struct list_head *hash_head; - struct lock_class *class; - unsigned long flags; - - class = look_up_lock_class(lock, subclass); - if (likely(class)) - return class; - - /* - * Debug-check: all keys must be persistent! 
- */ - if (!static_obj(lock->key)) { - debug_locks_off(); - printk("INFO: trying to register non-static key.\n"); - printk("the code is fine but needs lockdep annotation.\n"); - printk("turning off the locking correctness validator.\n"); + * Debugging printouts: + */ + if (verbose(prev->class) || verbose(next->class)) { + graph_unlock(); + printk("\n new dependency: "); + print_lock_name(prev->class); + printk(" => "); + print_lock_name(next->class); + printk("\n"); dump_stack(); - - return NULL; + return graph_lock(); } + return 1; +} - key = lock->key->subkeys + subclass; - hash_head = classhashentry(key); +/* + * Add the dependency to all directly-previous locks that are 'relevant'. + * The ones that are relevant are (in increasing distance from curr): + * all consecutive trylock entries and the final non-trylock entry - or + * the end of this context's lock-chain - whichever comes first. + */ +static int +check_prevs_add(struct task_struct *curr, struct held_lock *next) +{ + int depth = curr->lockdep_depth; + struct held_lock *hlock; - raw_local_irq_save(flags); - if (!graph_lock()) { - raw_local_irq_restore(flags); - return NULL; - } - /* - * We have to do the hash-walk again, to avoid races - * with another CPU: - */ - list_for_each_entry(class, hash_head, hash_entry) - if (class->key == key) - goto out_unlock_set; /* - * Allocate a new key from the static array, and add it to - * the hash: + * Debugging checks. + * + * Depth must not be zero for a non-head lock: */ - if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { - if (!debug_locks_off_graph_unlock()) { - raw_local_irq_restore(flags); - return NULL; - } - raw_local_irq_restore(flags); - - printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); - printk("turning off the locking correctness validator.\n"); - return NULL; - } - class = lock_classes + nr_lock_classes++; - debug_atomic_inc(&nr_unused_locks); - class->key = key; - class->name = lock->name; - class->subclass = subclass; - INIT_LIST_HEAD(&class->lock_entry); - INIT_LIST_HEAD(&class->locks_before); - INIT_LIST_HEAD(&class->locks_after); - class->name_version = count_matching_names(class); + if (!depth) + goto out_bug; /* - * We use RCU's safe list-add method to make - * parallel walking of the hash-list safe: + * At least two relevant locks must exist for this + * to be a head: */ - list_add_tail_rcu(&class->hash_entry, hash_head); - - if (verbose(class)) { - graph_unlock(); - raw_local_irq_restore(flags); - - printk("\nnew class %p: %s", class->key, class->name); - if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); - dump_stack(); + if (curr->held_locks[depth].irq_context != + curr->held_locks[depth-1].irq_context) + goto out_bug; - raw_local_irq_save(flags); - if (!graph_lock()) { - raw_local_irq_restore(flags); - return NULL; + for (;;) { + int distance = curr->lockdep_depth - depth + 1; + hlock = curr->held_locks + depth-1; + /* + * Only non-recursive-read entries get new dependencies + * added: + */ + if (hlock->read != 2) { + if (!check_prev_add(curr, hlock, next, distance)) + return 0; + /* + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: + */ + if (!hlock->trylock) + break; } + depth--; + /* + * End of lock-stack? 
+ */ + if (!depth) + break; + /* + * Stop the search if we cross into another context: + */ + if (curr->held_locks[depth].irq_context != + curr->held_locks[depth-1].irq_context) + break; } -out_unlock_set: - graph_unlock(); - raw_local_irq_restore(flags); - - if (!subclass || force) - lock->class_cache = class; + return 1; +out_bug: + if (!debug_locks_off_graph_unlock()) + return 0; - if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) - return NULL; + WARN_ON(1); - return class; + return 0; } -#ifdef CONFIG_PROVE_LOCKING +unsigned long nr_lock_chains; +static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; + /* * Look up a dependency chain. If the key is not present yet then * add it and return 1 - in this case the new dependency chain is @@ -1376,21 +1381,71 @@ cache_hit: chain->chain_key = chain_key; list_add_tail_rcu(&chain->entry, hash_head); debug_atomic_inc(&chain_lookup_misses); -#ifdef CONFIG_TRACE_IRQFLAGS - if (current->hardirq_context) - nr_hardirq_chains++; - else { - if (current->softirq_context) - nr_softirq_chains++; - else - nr_process_chains++; - } -#else - nr_process_chains++; -#endif + inc_chains(); + + return 1; +} + +static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, + struct held_lock *hlock, int chain_head) +{ + /* + * Trylock needs to maintain the stack of held locks, but it + * does not add new dependencies, because trylock can be done + * in any order. + * + * We look up the chain_key and do the O(N^2) check and update of + * the dependencies only if this is a new dependency chain. + * (If lookup_chain_cache() returns with 1 it acquires + * graph_lock for us) + */ + if (!hlock->trylock && (hlock->check == 2) && + lookup_chain_cache(curr->curr_chain_key, hlock->class)) { + /* + * Check whether last held lock: + * + * - is irq-safe, if this lock is irq-unsafe + * - is softirq-safe, if this lock is hardirq-unsafe + * + * And check whether the new lock's dependency graph + * could lead back to the previous lock. + * + * any of these scenarios could lead to a deadlock. 
If + * All validations + */ + int ret = check_deadlock(curr, hlock, lock, hlock->read); + + if (!ret) + return 0; + /* + * Mark recursive read, as we jump over it when + * building dependencies (just like we jump over + * trylock entries): + */ + if (ret == 2) + hlock->read = 2; + /* + * Add dependency only if this lock is not the head + * of the chain, and if it's not a secondary read-lock: + */ + if (!chain_head && ret != 2) + if (!check_prevs_add(curr, hlock)) + return 0; + graph_unlock(); + } else + /* after lookup_chain_cache(): */ + if (unlikely(!debug_locks)) + return 0; return 1; } +#else +static inline int validate_chain(struct task_struct *curr, + struct lockdep_map *lock, struct held_lock *hlock, + int chain_head) +{ + return 1; +} #endif /* @@ -1436,6 +1491,57 @@ static void check_chain_key(struct task_struct *curr) #endif } +static int +print_usage_bug(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) +{ + if (!debug_locks_off_graph_unlock() || debug_locks_silent) + return 0; + + printk("\n=================================\n"); + printk( "[ INFO: inconsistent lock state ]\n"); + print_kernel_version(); + printk( "---------------------------------\n"); + + printk("inconsistent {%s} -> {%s} usage.\n", + usage_str[prev_bit], usage_str[new_bit]); + + printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", + curr->comm, curr->pid, + trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, + trace_hardirqs_enabled(curr), + trace_softirqs_enabled(curr)); + print_lock(this); + + printk("{%s} state was registered at:\n", usage_str[prev_bit]); + print_stack_trace(this->class->usage_traces + prev_bit, 1); + + print_irqtrace_events(curr); + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Print out an error if an invalid bit is set: + */ +static inline int +valid_state(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) +{ + if (unlikely(this->class->usage_mask & (1 << bad_bit))) + return print_usage_bug(curr, this, bad_bit, new_bit); + return 1; +} + +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit); + #ifdef CONFIG_TRACE_IRQFLAGS /* @@ -1529,90 +1635,30 @@ void print_irqtrace_events(struct task_struct *curr) print_ip_sym(curr->softirq_disable_ip); } -#endif - -static int -print_usage_bug(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) +static int hardirq_verbose(struct lock_class *class) { - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n=================================\n"); - printk( "[ INFO: inconsistent lock state ]\n"); - print_kernel_version(); - printk( "---------------------------------\n"); - - printk("inconsistent {%s} -> {%s} usage.\n", - usage_str[prev_bit], usage_str[new_bit]); - - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", - curr->comm, curr->pid, - trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, - trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - trace_hardirqs_enabled(curr), - trace_softirqs_enabled(curr)); - print_lock(this); - - printk("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(this->class->usage_traces + prev_bit, 1); - - 
print_irqtrace_events(curr); - printk("\nother info that might help us debug this:\n"); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - +#if HARDIRQ_VERBOSE + return class_filter(class); +#endif return 0; } -/* - * Print out an error if an invalid bit is set: - */ -static inline int -valid_state(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) +static int softirq_verbose(struct lock_class *class) { - if (unlikely(this->class->usage_mask & (1 << bad_bit))) - return print_usage_bug(curr, this, bad_bit, new_bit); - return 1; +#if SOFTIRQ_VERBOSE + return class_filter(class); +#endif + return 0; } #define STRICT_READ_CHECKS 1 -/* - * Mark a lock with a usage bit, and validate the state transition: - */ -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) +static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { - unsigned int new_mask = 1 << new_bit, ret = 1; - - /* - * If already set then do not dirty the cacheline, - * nor do any checks: - */ - if (likely(this->class->usage_mask & new_mask)) - return 1; - - if (!graph_lock()) - return 0; - /* - * Make sure we didnt race: - */ - if (unlikely(this->class->usage_mask & new_mask)) { - graph_unlock(); - return 1; - } - - this->class->usage_mask |= new_mask; + int ret = 1; - if (!save_trace(this->class->usage_traces + new_bit)) - return 0; - - switch (new_bit) { -#ifdef CONFIG_TRACE_IRQFLAGS + switch(new_bit) { case LOCK_USED_IN_HARDIRQ: if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) return 0; @@ -1771,37 +1817,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (softirq_verbose(this->class)) ret = 2; break; -#endif - case LOCK_USED: - /* - * Add it to the global list of classes: - */ - list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); - debug_atomic_dec(&nr_unused_locks); - break; default: - if (!debug_locks_off_graph_unlock()) - return 0; WARN_ON(1); - return 0; - } - - graph_unlock(); - - /* - * We must printk outside of the graph_lock: - */ - if (ret == 2) { - printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); - print_lock(this); - print_irqtrace_events(curr); - dump_stack(); + break; } return ret; } -#ifdef CONFIG_TRACE_IRQFLAGS /* * Mark all held locks with a usage bit: */ @@ -1890,101 +1913,268 @@ void trace_hardirqs_on(void) if (!mark_held_locks(curr, 0)) return; - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(&hardirqs_on_events); + curr->hardirq_enable_ip = ip; + curr->hardirq_enable_event = ++curr->irq_events; + debug_atomic_inc(&hardirqs_on_events); +} + +EXPORT_SYMBOL(trace_hardirqs_on); + +/* + * Hardirqs were disabled: + */ +void trace_hardirqs_off(void) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->hardirqs_enabled) { + /* + * We have done an ON -> OFF transition: + */ + curr->hardirqs_enabled = 0; + curr->hardirq_disable_ip = _RET_IP_; + curr->hardirq_disable_event = ++curr->irq_events; + debug_atomic_inc(&hardirqs_off_events); + } else + debug_atomic_inc(&redundant_hardirqs_off); +} + +EXPORT_SYMBOL(trace_hardirqs_off); + +/* + * Softirqs will be enabled: + */ +void trace_softirqs_on(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; 
+ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->softirqs_enabled) { + debug_atomic_inc(&redundant_softirqs_on); + return; + } + + /* + * We'll do an OFF -> ON transition: + */ + curr->softirqs_enabled = 1; + curr->softirq_enable_ip = ip; + curr->softirq_enable_event = ++curr->irq_events; + debug_atomic_inc(&softirqs_on_events); + /* + * We are going to turn softirqs on, so set the + * usage bit for all held locks, if hardirqs are + * enabled too: + */ + if (curr->hardirqs_enabled) + mark_held_locks(curr, 0); +} + +/* + * Softirqs were disabled: + */ +void trace_softirqs_off(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->softirqs_enabled) { + /* + * We have done an ON -> OFF transition: + */ + curr->softirqs_enabled = 0; + curr->softirq_disable_ip = ip; + curr->softirq_disable_event = ++curr->irq_events; + debug_atomic_inc(&softirqs_off_events); + DEBUG_LOCKS_WARN_ON(!softirq_count()); + } else + debug_atomic_inc(&redundant_softirqs_off); +} + +static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) +{ + /* + * If non-trylock use in a hardirq or softirq context, then + * mark the lock as used in these contexts: + */ + if (!hlock->trylock) { + if (hlock->read) { + if (curr->hardirq_context) + if (!mark_lock(curr, hlock, + LOCK_USED_IN_HARDIRQ_READ)) + return 0; + if (curr->softirq_context) + if (!mark_lock(curr, hlock, + LOCK_USED_IN_SOFTIRQ_READ)) + return 0; + } else { + if (curr->hardirq_context) + if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) + return 0; + if (curr->softirq_context) + if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) + return 0; + } + } + if (!hlock->hardirqs_off) { + if (hlock->read) { + if (!mark_lock(curr, hlock, + LOCK_ENABLED_HARDIRQS_READ)) + return 0; + if (curr->softirqs_enabled) + if (!mark_lock(curr, hlock, + LOCK_ENABLED_SOFTIRQS_READ)) + return 0; + } else { + if (!mark_lock(curr, hlock, + LOCK_ENABLED_HARDIRQS)) + return 0; + if (curr->softirqs_enabled) + if (!mark_lock(curr, hlock, + LOCK_ENABLED_SOFTIRQS)) + return 0; + } + } + + return 1; +} + +static int separate_irq_context(struct task_struct *curr, + struct held_lock *hlock) +{ + unsigned int depth = curr->lockdep_depth; + + /* + * Keep track of points where we cross into an interrupt context: + */ + hlock->irq_context = 2*(curr->hardirq_context ? 
1 : 0) + + curr->softirq_context; + if (depth) { + struct held_lock *prev_hlock; + + prev_hlock = curr->held_locks + depth-1; + /* + * If we cross into another context, reset the + * hash key (this also prevents the checking and the + * adding of the dependency to 'prev'): + */ + if (prev_hlock->irq_context != hlock->irq_context) + return 1; + } + return 0; } -EXPORT_SYMBOL(trace_hardirqs_on); +#else -/* - * Hardirqs were disabled: - */ -void trace_hardirqs_off(void) +static inline +int mark_lock_irq(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { - struct task_struct *curr = current; - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; + WARN_ON(1); + return 1; +} - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; +static inline int mark_irqflags(struct task_struct *curr, + struct held_lock *hlock) +{ + return 1; +} - if (curr->hardirqs_enabled) { - /* - * We have done an ON -> OFF transition: - */ - curr->hardirqs_enabled = 0; - curr->hardirq_disable_ip = _RET_IP_; - curr->hardirq_disable_event = ++curr->irq_events; - debug_atomic_inc(&hardirqs_off_events); - } else - debug_atomic_inc(&redundant_hardirqs_off); +static inline int separate_irq_context(struct task_struct *curr, + struct held_lock *hlock) +{ + return 0; } -EXPORT_SYMBOL(trace_hardirqs_off); +#endif /* - * Softirqs will be enabled: + * Mark a lock with a usage bit, and validate the state transition: */ -void trace_softirqs_on(unsigned long ip) +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { - struct task_struct *curr = current; - - if (unlikely(!debug_locks)) - return; - - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - - if (curr->softirqs_enabled) { - debug_atomic_inc(&redundant_softirqs_on); - return; - } + unsigned int new_mask = 1 << new_bit, ret = 1; /* - * We'll do an OFF -> ON transition: + * If already set then do not dirty the cacheline, + * nor do any checks: */ - curr->softirqs_enabled = 1; - curr->softirq_enable_ip = ip; - curr->softirq_enable_event = ++curr->irq_events; - debug_atomic_inc(&softirqs_on_events); + if (likely(this->class->usage_mask & new_mask)) + return 1; + + if (!graph_lock()) + return 0; /* - * We are going to turn softirqs on, so set the - * usage bit for all held locks, if hardirqs are - * enabled too: + * Make sure we didnt race: */ - if (curr->hardirqs_enabled) - mark_held_locks(curr, 0); -} - -/* - * Softirqs were disabled: - */ -void trace_softirqs_off(unsigned long ip) -{ - struct task_struct *curr = current; + if (unlikely(this->class->usage_mask & new_mask)) { + graph_unlock(); + return 1; + } - if (unlikely(!debug_locks)) - return; + this->class->usage_mask |= new_mask; - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; + if (!save_trace(this->class->usage_traces + new_bit)) + return 0; - if (curr->softirqs_enabled) { + switch (new_bit) { + case LOCK_USED_IN_HARDIRQ: + case LOCK_USED_IN_SOFTIRQ: + case LOCK_USED_IN_HARDIRQ_READ: + case LOCK_USED_IN_SOFTIRQ_READ: + case LOCK_ENABLED_HARDIRQS: + case LOCK_ENABLED_SOFTIRQS: + case LOCK_ENABLED_HARDIRQS_READ: + case LOCK_ENABLED_SOFTIRQS_READ: + ret = mark_lock_irq(curr, this, new_bit); + if (!ret) + return 0; + break; + case LOCK_USED: /* - * We have done an ON -> OFF transition: + * Add it to the global list of classes: */ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; - debug_atomic_inc(&softirqs_off_events); - 
DEBUG_LOCKS_WARN_ON(!softirq_count()); - } else - debug_atomic_inc(&redundant_softirqs_off); -} + list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); + debug_atomic_dec(&nr_unused_locks); + break; + default: + if (!debug_locks_off_graph_unlock()) + return 0; + WARN_ON(1); + return 0; + } -#endif + graph_unlock(); + + /* + * We must printk outside of the graph_lock: + */ + if (ret == 2) { + printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); + print_lock(this); + print_irqtrace_events(curr); + dump_stack(); + } + + return ret; +} /* * Initialize a lock instance's lock-class mapping info: @@ -2082,56 +2272,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->check = check; hlock->hardirqs_off = hardirqs_off; - if (check != 2) - goto out_calc_hash; -#ifdef CONFIG_TRACE_IRQFLAGS - /* - * If non-trylock use in a hardirq or softirq context, then - * mark the lock as used in these contexts: - */ - if (!trylock) { - if (read) { - if (curr->hardirq_context) - if (!mark_lock(curr, hlock, - LOCK_USED_IN_HARDIRQ_READ)) - return 0; - if (curr->softirq_context) - if (!mark_lock(curr, hlock, - LOCK_USED_IN_SOFTIRQ_READ)) - return 0; - } else { - if (curr->hardirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) - return 0; - if (curr->softirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) - return 0; - } - } - if (!hardirqs_off) { - if (read) { - if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS_READ)) - return 0; - if (curr->softirqs_enabled) - if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS)) - return 0; - if (curr->softirqs_enabled) - if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS)) - return 0; - } - } -#endif + if (check == 2 && !mark_irqflags(curr, hlock)) + return 0; + /* mark it as used: */ if (!mark_lock(curr, hlock, LOCK_USED)) return 0; -out_calc_hash: + /* * Calculate the chain hash: it's the combined has of all the * lock keys along the dependency chain. We save the hash value @@ -2154,77 +2301,15 @@ out_calc_hash: } hlock->prev_chain_key = chain_key; - -#ifdef CONFIG_TRACE_IRQFLAGS - /* - * Keep track of points where we cross into an interrupt context: - */ - hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + - curr->softirq_context; - if (depth) { - struct held_lock *prev_hlock; - - prev_hlock = curr->held_locks + depth-1; - /* - * If we cross into another context, reset the - * hash key (this also prevents the checking and the - * adding of the dependency to 'prev'): - */ - if (prev_hlock->irq_context != hlock->irq_context) { - chain_key = 0; - chain_head = 1; - } + if (separate_irq_context(curr, hlock)) { + chain_key = 0; + chain_head = 1; } -#endif chain_key = iterate_chain_key(chain_key, id); curr->curr_chain_key = chain_key; - /* - * Trylock needs to maintain the stack of held locks, but it - * does not add new dependencies, because trylock can be done - * in any order. - * - * We look up the chain_key and do the O(N^2) check and update of - * the dependencies only if this is a new dependency chain. - * (If lookup_chain_cache() returns with 1 it acquires - * graph_lock for us) - */ - if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) { - /* - * Check whether last held lock: - * - * - is irq-safe, if this lock is irq-unsafe - * - is softirq-safe, if this lock is hardirq-unsafe - * - * And check whether the new lock's dependency graph - * could lead back to the previous lock. 
- * - * any of these scenarios could lead to a deadlock. If - * All validations - */ - int ret = check_deadlock(curr, hlock, lock, read); - - if (!ret) - return 0; - /* - * Mark recursive read, as we jump over it when - * building dependencies (just like we jump over - * trylock entries): - */ - if (ret == 2) - hlock->read = 2; - /* - * Add dependency only if this lock is not the head - * of the chain, and if it's not a secondary read-lock: - */ - if (!chain_head && ret != 2) - if (!check_prevs_add(curr, hlock)) - return 0; - graph_unlock(); - } else - /* after lookup_chain_cache(): */ - if (unlikely(!debug_locks)) - return 0; + if (!validate_chain(curr, lock, hlock, chain_head)) + return 0; curr->lockdep_depth++; check_chain_key(curr); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 58f35e5..2fde341 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -271,8 +271,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v) if (nr_list_entries) factor = sum_forward_deps / nr_list_entries; +#ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", nr_lock_chains, MAX_LOCKDEP_CHAINS); +#endif #ifdef CONFIG_TRACE_IRQFLAGS seq_printf(m, " in-hardirq chains: %11u\n", -- cgit v1.1 From f20786ff4da51e56b1956acf30be2552be266746 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:56 -0700 Subject: lockstat: core infrastructure Introduce the core lock statistics code. Lock statistics provides lock wait-time and hold-time (as well as the count of corresponding contention and acquisitions events). Also, the first few call-sites that encounter contention are tracked. Lock wait-time is the time spent waiting on the lock. This provides insight into the locking scheme, that is, a heavily contended lock is indicative of a too coarse locking scheme. Lock hold-time is the duration the lock was held, this provides a reference for the wait-time numbers, so they can be put into perspective. 1) lock 2) ... do stuff .. unlock 3) The time between 1 and 2 is the wait-time. The time between 2 and 3 is the hold-time. The lockdep held-lock tracking code is reused, because it already collects locks into meaningful groups (classes), and because it is an existing infrastructure for lock instrumentation. Currently lockdep tracks lock acquisition with two hooks: lock() lock_acquire() _lock() ... code protected by lock ... unlock() lock_release() _unlock() We need to extend this with two more hooks, in order to measure contention. lock_contended() - used to measure contention events lock_acquired() - completion of the contention These are then placed the following way: lock() lock_acquire() if (!_try_lock()) lock_contended() _lock() lock_acquired() ... do locked stuff ... unlock() lock_release() _unlock() (Note: the try_lock() 'trick' is used to avoid instrumenting all platform dependent lock primitive implementations.) It is also possible to toggle the two lockdep features at runtime using: /proc/sys/kernel/prove_locking /proc/sys/kernel/lock_stat (esp. 
turning off the O(n^2) prove_locking functionaliy can help) [akpm@linux-foundation.org: build fixes] [akpm@linux-foundation.org: nuke unneeded ifdefs] Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 247 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 22 +++++ 2 files changed, 269 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 87ac364..70ca4db 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,6 +42,20 @@ #include "lockdep_internals.h" +#ifdef CONFIG_PROVE_LOCKING +int prove_locking = 1; +module_param(prove_locking, int, 0644); +#else +#define prove_locking 0 +#endif + +#ifdef CONFIG_LOCK_STAT +int lock_stat = 1; +module_param(lock_stat, int, 0644); +#else +#define lock_stat 0 +#endif + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -104,6 +118,70 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; unsigned long nr_lock_classes; static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +#ifdef CONFIG_LOCK_STAT +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); + +static int lock_contention_point(struct lock_class *class, unsigned long ip) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { + if (class->contention_point[i] == 0) { + class->contention_point[i] = ip; + break; + } + if (class->contention_point[i] == ip) + break; + } + + return i; +} + +static void lock_time_inc(struct lock_time *lt, s64 time) +{ + if (time > lt->max) + lt->max = time; + + if (time < lt->min || !lt->min) + lt->min = time; + + lt->total += time; + lt->nr++; +} + +static struct lock_class_stats *get_lock_stats(struct lock_class *class) +{ + return &get_cpu_var(lock_stats)[class - lock_classes]; +} + +static void put_lock_stats(struct lock_class_stats *stats) +{ + put_cpu_var(lock_stats); +} + +static void lock_release_holdtime(struct held_lock *hlock) +{ + struct lock_class_stats *stats; + s64 holdtime; + + if (!lock_stat) + return; + + holdtime = sched_clock() - hlock->holdtime_stamp; + + stats = get_lock_stats(hlock->class); + if (hlock->read) + lock_time_inc(&stats->read_holdtime, holdtime); + else + lock_time_inc(&stats->write_holdtime, holdtime); + put_lock_stats(stats); +} +#else +static inline void lock_release_holdtime(struct held_lock *hlock) +{ +} +#endif + /* * We keep a global list of all lock classes. The list only grows, * never shrinks. The list is only accessed with the lockdep @@ -2221,6 +2299,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; u64 chain_key; + if (!prove_locking) + check = 1; + if (unlikely(!debug_locks)) return 0; @@ -2271,6 +2352,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->read = read; hlock->check = check; hlock->hardirqs_off = hardirqs_off; +#ifdef CONFIG_LOCK_STAT + hlock->waittime_stamp = 0; + hlock->holdtime_stamp = sched_clock(); +#endif if (check == 2 && !mark_irqflags(curr, hlock)) return 0; @@ -2411,6 +2496,8 @@ lock_release_non_nested(struct task_struct *curr, return print_unlock_inbalance_bug(curr, lock, ip); found_it: + lock_release_holdtime(hlock); + /* * We have the right lock to unlock, 'hlock' points to it. 
* Now we remove it from the stack, and add back the other @@ -2463,6 +2550,8 @@ static int lock_release_nested(struct task_struct *curr, curr->curr_chain_key = hlock->prev_chain_key; + lock_release_holdtime(hlock); + #ifdef CONFIG_DEBUG_LOCKDEP hlock->prev_chain_key = 0; hlock->class = NULL; @@ -2537,6 +2626,9 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; + if (unlikely(!lock_stat && !prove_locking)) + return; + if (unlikely(current->lockdep_recursion)) return; @@ -2556,6 +2648,9 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { unsigned long flags; + if (unlikely(!lock_stat && !prove_locking)) + return; + if (unlikely(current->lockdep_recursion)) return; @@ -2569,6 +2664,158 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) EXPORT_SYMBOL_GPL(lock_release); +#ifdef CONFIG_LOCK_STAT +static int +print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, + unsigned long ip) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + printk("\n=================================\n"); + printk( "[ BUG: bad contention detected! ]\n"); + printk( "---------------------------------\n"); + printk("%s/%d is trying to contend lock (", + curr->comm, curr->pid); + print_lockdep_cache(lock); + printk(") at:\n"); + print_ip_sym(ip); + printk("but there are no locks held!\n"); + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static void +__lock_contended(struct lockdep_map *lock, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class_stats *stats; + unsigned int depth; + int i, point; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + print_lock_contention_bug(curr, lock, ip); + return; + +found_it: + hlock->waittime_stamp = sched_clock(); + + point = lock_contention_point(hlock->class, ip); + + stats = get_lock_stats(hlock->class); + if (point < ARRAY_SIZE(stats->contention_point)) + stats->contention_point[i]++; + put_lock_stats(stats); +} + +static void +__lock_acquired(struct lockdep_map *lock) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class_stats *stats; + unsigned int depth; + u64 now; + s64 waittime; + int i; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + print_lock_contention_bug(curr, lock, _RET_IP_); + return; + +found_it: + if (!hlock->waittime_stamp) + return; + + now = sched_clock(); + waittime = now - hlock->waittime_stamp; + hlock->holdtime_stamp = now; + + stats = get_lock_stats(hlock->class); + if (hlock->read) + lock_time_inc(&stats->read_waittime, waittime); + else + lock_time_inc(&stats->write_waittime, waittime); + put_lock_stats(stats); +} + +void lock_contended(struct 
lockdep_map *lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lock_stat)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lock_contended(lock, ip); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_contended); + +void lock_acquired(struct lockdep_map *lock) +{ + unsigned long flags; + + if (unlikely(!lock_stat)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lock_acquired(lock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_acquired); +#endif + /* * Used by the testsuite, sanitize the validator state * after a simulated failure: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2aaa3f9..e69179b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -161,6 +161,8 @@ extern ctl_table inotify_table[]; int sysctl_legacy_va_layout; #endif +extern int prove_locking; +extern int lock_stat; /* The default sysctl tables: */ @@ -282,6 +284,26 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_PROVE_LOCKING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_LOCK_STAT + { + .ctl_name = CTL_UNNUMBERED, + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = CTL_UNNUMBERED, .procname = "sched_features", -- cgit v1.1 From c46261de0d98372112d8edf16f74ce418a268d46 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:57 -0700 Subject: lockstat: human readability tweaks Present all this fancy new lock statistics information: *warning, _wide_ output ahead* (output edited for purpose of brevity) # cat /proc/lock_stat lock_stat version 0.1 ----------------------------------------------------------------------------------------------------------------------------------------------------------------- class name contentions waittime-min waittime-max waittime-total acquisitions holdtime-min holdtime-max holdtime-total ----------------------------------------------------------------------------------------------------------------------------------------------------------------- &inode->i_mutex: 14458 6.57 398832.75 2469412.23 6768876 0.34 11398383.65 339410830.89 --------------- &inode->i_mutex 4486 [] pipe_wait+0x86/0x8d &inode->i_mutex 0 [] pipe_write_fasync+0x29/0x5d &inode->i_mutex 0 [] pipe_read+0x74/0x3a5 &inode->i_mutex 0 [] do_lookup+0x81/0x1ae ................................................................................................................................................................. 
&inode->i_data.tree_lock-W: 491 0.27 62.47 493.89 2477833 0.39 468.89 1146584.25 &inode->i_data.tree_lock-R: 65 0.44 4.27 48.78 26288792 0.36 184.62 10197458.24 -------------------------- &inode->i_data.tree_lock 46 [] __do_page_cache_readahead+0x69/0x24f &inode->i_data.tree_lock 31 [] add_to_page_cache+0x31/0xba &inode->i_data.tree_lock 0 [] __do_page_cache_readahead+0xc2/0x24f &inode->i_data.tree_lock 0 [] find_get_page+0x1a/0x58 ................................................................................................................................................................. proc_inum_idr.lock: 0 0.00 0.00 0.00 36 0.00 65.60 148.26 proc_subdir_lock: 0 0.00 0.00 0.00 3049859 0.00 106.81 1563212.42 shrinker_rwsem-W: 0 0.00 0.00 0.00 5 0.00 1.73 3.68 shrinker_rwsem-R: 0 0.00 0.00 0.00 633 2.57 246.57 10909.76 'contentions' and 'acquisitions' are the number of such events measured (since the last reset). The waittime- and holdtime- (min, max, total) numbers are presented in microseconds. If there are any contention points, the lock class is presented in the block format (as i_mutex and tree_lock above), otherwise a single line of output is presented. The output is sorted on absolute number of contentions (read + write), this should get the worst offenders presented first, so that: # grep : /proc/lock_stat | head will quickly show who's bad. The stats can be reset using: # echo 0 > /proc/lock_stat [bunk@stusta.de: make 2 functions static] [akpm@linux-foundation.org: fix printk warning] Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Jason Baron Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 44 +++++++++ kernel/lockdep_proc.c | 266 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 310 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 70ca4db..a8dc99d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -149,6 +149,50 @@ static void lock_time_inc(struct lock_time *lt, s64 time) lt->nr++; } +static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) +{ + dst->min += src->min; + dst->max += src->max; + dst->total += src->total; + dst->nr += src->nr; +} + +struct lock_class_stats lock_stats(struct lock_class *class) +{ + struct lock_class_stats stats; + int cpu, i; + + memset(&stats, 0, sizeof(struct lock_class_stats)); + for_each_possible_cpu(cpu) { + struct lock_class_stats *pcs = + &per_cpu(lock_stats, cpu)[class - lock_classes]; + + for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) + stats.contention_point[i] += pcs->contention_point[i]; + + lock_time_add(&pcs->read_waittime, &stats.read_waittime); + lock_time_add(&pcs->write_waittime, &stats.write_waittime); + + lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); + lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + } + + return stats; +} + +void clear_lock_stats(struct lock_class *class) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct lock_class_stats *cpu_stats = + &per_cpu(lock_stats, cpu)[class - lock_classes]; + + memset(cpu_stats, 0, sizeof(struct lock_class_stats)); + } + memset(class->contention_point, 0, sizeof(class->contention_point)); +} + static struct lock_class_stats *get_lock_stats(struct lock_class *class) { return &get_cpu_var(lock_stats)[class - lock_classes]; diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 2fde341..e682926 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -15,6 +15,10 @@ 
#include #include #include +#include +#include +#include +#include #include "lockdep_internals.h" @@ -344,6 +348,262 @@ static const struct file_operations proc_lockdep_stats_operations = { .release = seq_release, }; +#ifdef CONFIG_LOCK_STAT + +struct lock_stat_data { + struct lock_class *class; + struct lock_class_stats stats; +}; + +struct lock_stat_seq { + struct lock_stat_data *iter; + struct lock_stat_data *iter_end; + struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; +}; + +/* + * sort on absolute number of contentions + */ +static int lock_stat_cmp(const void *l, const void *r) +{ + const struct lock_stat_data *dl = l, *dr = r; + unsigned long nl, nr; + + nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr; + nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr; + + return nr - nl; +} + +static void seq_line(struct seq_file *m, char c, int offset, int length) +{ + int i; + + for (i = 0; i < offset; i++) + seq_puts(m, " "); + for (i = 0; i < length; i++) + seq_printf(m, "%c", c); + seq_puts(m, "\n"); +} + +static void snprint_time(char *buf, size_t bufsiz, s64 nr) +{ + unsigned long rem; + + rem = do_div(nr, 1000); /* XXX: do_div_signed */ + snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); +} + +static void seq_time(struct seq_file *m, s64 time) +{ + char num[15]; + + snprint_time(num, sizeof(num), time); + seq_printf(m, " %14s", num); +} + +static void seq_lock_time(struct seq_file *m, struct lock_time *lt) +{ + seq_printf(m, "%14lu", lt->nr); + seq_time(m, lt->min); + seq_time(m, lt->max); + seq_time(m, lt->total); +} + +static void seq_stats(struct seq_file *m, struct lock_stat_data *data) +{ + char name[39]; + struct lock_class *class; + struct lock_class_stats *stats; + int i, namelen; + + class = data->class; + stats = &data->stats; + + snprintf(name, 38, "%s", class->name); + namelen = strlen(name); + + if (stats->write_holdtime.nr) { + if (stats->read_holdtime.nr) + seq_printf(m, "%38s-W:", name); + else + seq_printf(m, "%40s:", name); + + seq_lock_time(m, &stats->write_waittime); + seq_puts(m, " "); + seq_lock_time(m, &stats->write_holdtime); + seq_puts(m, "\n"); + } + + if (stats->read_holdtime.nr) { + seq_printf(m, "%38s-R:", name); + seq_lock_time(m, &stats->read_waittime); + seq_puts(m, " "); + seq_lock_time(m, &stats->read_holdtime); + seq_puts(m, "\n"); + } + + if (stats->read_waittime.nr + stats->write_waittime.nr == 0) + return; + + if (stats->read_holdtime.nr) + namelen += 2; + + for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { + char sym[KSYM_SYMBOL_LEN]; + char ip[32]; + + if (class->contention_point[i] == 0) + break; + + if (!i) + seq_line(m, '-', 40-namelen, namelen); + + sprint_symbol(sym, class->contention_point[i]); + snprintf(ip, sizeof(ip), "[<%p>]", + (void *)class->contention_point[i]); + seq_printf(m, "%40s %14lu %29s %s\n", name, + stats->contention_point[i], + ip, sym); + } + if (i) { + seq_puts(m, "\n"); + seq_line(m, '.', 0, 40 + 1 + 8 * (14 + 1)); + seq_puts(m, "\n"); + } +} + +static void seq_header(struct seq_file *m) +{ + seq_printf(m, "lock_stat version 0.1\n"); + seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1)); + seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s\n", + "class name", + "contentions", + "waittime-min", + "waittime-max", + "waittime-total", + "acquisitions", + "holdtime-min", + "holdtime-max", + "holdtime-total"); + seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1)); + seq_printf(m, "\n"); +} + +static void *ls_start(struct seq_file *m, loff_t *pos) +{ + struct lock_stat_seq *data = 
m->private; + + if (data->iter == data->stats) + seq_header(m); + + return data->iter; +} + +static void *ls_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct lock_stat_seq *data = m->private; + + (*pos)++; + + data->iter = v; + data->iter++; + if (data->iter == data->iter_end) + data->iter = NULL; + + return data->iter; +} + +static void ls_stop(struct seq_file *m, void *v) +{ +} + +static int ls_show(struct seq_file *m, void *v) +{ + struct lock_stat_seq *data = m->private; + + seq_stats(m, data->iter); + return 0; +} + +static struct seq_operations lockstat_ops = { + .start = ls_start, + .next = ls_next, + .stop = ls_stop, + .show = ls_show, +}; + +static int lock_stat_open(struct inode *inode, struct file *file) +{ + int res; + struct lock_class *class; + struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq)); + + if (!data) + return -ENOMEM; + + res = seq_open(file, &lockstat_ops); + if (!res) { + struct lock_stat_data *iter = data->stats; + struct seq_file *m = file->private_data; + + data->iter = iter; + list_for_each_entry(class, &all_lock_classes, lock_entry) { + iter->class = class; + iter->stats = lock_stats(class); + iter++; + } + data->iter_end = iter; + + sort(data->stats, data->iter_end - data->iter, + sizeof(struct lock_stat_data), + lock_stat_cmp, NULL); + + m->private = data; + } else + vfree(data); + + return res; +} + +static ssize_t lock_stat_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct lock_class *class; + char c; + + if (count) { + if (get_user(c, buf)) + return -EFAULT; + + if (c != '0') + return count; + + list_for_each_entry(class, &all_lock_classes, lock_entry) + clear_lock_stats(class); + } + return count; +} + +static int lock_stat_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + vfree(seq->private); + seq->private = NULL; + return seq_release(inode, file); +} + +static const struct file_operations proc_lock_stat_operations = { + .open = lock_stat_open, + .write = lock_stat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = lock_stat_release, +}; +#endif /* CONFIG_LOCK_STAT */ + static int __init lockdep_proc_init(void) { struct proc_dir_entry *entry; @@ -356,6 +616,12 @@ static int __init lockdep_proc_init(void) if (entry) entry->proc_fops = &proc_lockdep_stats_operations; +#ifdef CONFIG_LOCK_STAT + entry = create_proc_entry("lock_stat", S_IRUSR, NULL); + if (entry) + entry->proc_fops = &proc_lock_stat_operations; +#endif + return 0; } -- cgit v1.1 From 4fe87745a6722d42ff27a60768c77958fa1fc498 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:58 -0700 Subject: lockstat: hook into spinlock_t, rwlock_t, rwsem and mutex Call the new lockstat tracking functions from the various lock primitives. 
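
[Editorial note: the conversions in the diff below rely on a LOCK_CONTENDED() wrapper rather than open-coding the try/contended/acquired sequence at every call site. As a rough sketch only (assuming the helper sits next to the lockdep hooks and that each lock type exposes a non-blocking trylock primitive; the exact definition in include/linux/lockdep.h may differ), such a wrapper could look like:

/*
 * Illustrative sketch, not the verbatim kernel macro: wrap a blocking
 * lock acquisition so the slow path is bracketed by the lockstat hooks.
 * 'try' must be the non-blocking variant of 'lock' for this lock type.
 */
#define LOCK_CONTENDED(_lock, try, lock)			\
do {								\
	if (!try(_lock)) {					\
		lock_contended(&(_lock)->dep_map, _RET_IP_);	\
		lock(_lock);					\
	}							\
	lock_acquired(&(_lock)->dep_map);			\
} while (0)

With a wrapper of this shape, a call such as LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock) records a contention event only when the fast-path trylock fails, so the uncontended case stays cheap.]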
Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/mutex.c | 8 ++++++++ kernel/rwsem.c | 8 ++++---- kernel/spinlock.c | 28 ++++++++++++++-------------- 3 files changed, 26 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 303eab1..7a3f327 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -139,6 +139,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; + old_val = atomic_xchg(&lock->count, -1); + if (old_val == 1) + goto done; + + lock_contended(&lock->dep_map, _RET_IP_); + for (;;) { /* * Lets try to take the lock again - this is needed even if @@ -174,6 +180,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) spin_lock_mutex(&lock->wait_lock, flags); } + lock_acquired(&lock->dep_map); +done: /* got the lock - rejoice! */ mutex_remove_waiter(lock, &waiter, task_thread_info(task)); debug_mutex_set_owner(lock, task_thread_info(task)); diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 9a87886..1ec620c0 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -20,7 +20,7 @@ void down_read(struct rw_semaphore *sem) might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); - __down_read(sem); + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } EXPORT_SYMBOL(down_read); @@ -47,7 +47,7 @@ void down_write(struct rw_semaphore *sem) might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - __down_write(sem); + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } EXPORT_SYMBOL(down_write); @@ -111,7 +111,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); - __down_read(sem); + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } EXPORT_SYMBOL(down_read_nested); @@ -130,7 +130,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - __down_write_nested(sem, subclass); + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } EXPORT_SYMBOL(down_write_nested); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index cd93bfe..cd72424 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -72,7 +72,7 @@ void __lockfunc _read_lock(rwlock_t *lock) { preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } EXPORT_SYMBOL(_read_lock); @@ -89,7 +89,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) * that interrupts are not re-enabled during lock-acquire: */ #ifdef CONFIG_LOCKDEP - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); #else _raw_spin_lock_flags(lock, &flags); #endif @@ -102,7 +102,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock) local_irq_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(_spin_lock_irq); @@ -111,7 +111,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock) local_bh_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(_spin_lock_bh); @@ -122,7 +122,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) 
local_irq_save(flags); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); return flags; } EXPORT_SYMBOL(_read_lock_irqsave); @@ -132,7 +132,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock) local_irq_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } EXPORT_SYMBOL(_read_lock_irq); @@ -141,7 +141,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock) local_bh_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } EXPORT_SYMBOL(_read_lock_bh); @@ -152,7 +152,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); return flags; } EXPORT_SYMBOL(_write_lock_irqsave); @@ -162,7 +162,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock) local_irq_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } EXPORT_SYMBOL(_write_lock_irq); @@ -171,7 +171,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock) local_bh_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } EXPORT_SYMBOL(_write_lock_bh); @@ -179,7 +179,7 @@ void __lockfunc _spin_lock(spinlock_t *lock) { preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(_spin_lock); @@ -188,7 +188,7 @@ void __lockfunc _write_lock(rwlock_t *lock) { preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } EXPORT_SYMBOL(_write_lock); @@ -289,7 +289,7 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) { preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(_spin_lock_nested); @@ -306,7 +306,7 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas * that interrupts are not re-enabled during lock-acquire: */ #ifdef CONFIG_LOCKDEP - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); #else _raw_spin_lock_flags(lock, &flags); #endif -- cgit v1.1 From 4b32d0a4e9ec07808a5c406a416c6576c986b047 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:59 -0700 Subject: lockdep: various fixes - update the copyright notices - use the default hash function - fix a thinko in a BUILD_BUG_ON - add a WARN_ON to spot inconsitent naming - fix a termination issue in /proc/lock_stat [akpm@linux-foundation.org: cleanups] Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 21 ++++++++++++--------- kernel/lockdep_proc.c | 6 +++++- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index a8dc99d..cb64022 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -5,7 +5,8 @@ * * Started by Ingo Molnar: 
* - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * this code maps all the lock dependencies as they occur in a live kernel * and will warn about the following classes of locking bugs: @@ -37,6 +38,7 @@ #include #include #include +#include #include @@ -238,8 +240,7 @@ LIST_HEAD(all_lock_classes); */ #define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) #define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) -#define CLASSHASH_MASK (CLASSHASH_SIZE - 1) -#define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK) +#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) #define classhashentry(key) (classhash_table + __classhashfn((key))) static struct list_head classhash_table[CLASSHASH_SIZE]; @@ -250,9 +251,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE]; */ #define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) #define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) -#define CHAINHASH_MASK (CHAINHASH_SIZE - 1) -#define __chainhashfn(chain) \ - (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK) +#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) static struct list_head chainhash_table[CHAINHASH_SIZE]; @@ -676,7 +675,8 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * (or spin_lock_init()) call - which acts as the key. For static * locks we use the lock object itself as the key. */ - BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class)); + BUILD_BUG_ON(sizeof(struct lock_class_key) > + sizeof(struct lockdep_map)); key = lock->key->subkeys + subclass; @@ -686,9 +686,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * We can walk the hash lockfree, because the hash only * grows, and we are careful when adding entries to the end: */ - list_for_each_entry(class, hash_head, hash_entry) - if (class->key == key) + list_for_each_entry(class, hash_head, hash_entry) { + if (class->key == key) { + WARN_ON_ONCE(class->name != lock->name); return class; + } + } return NULL; } diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index e682926..39163ed 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -5,7 +5,8 @@ * * Started by Ingo Molnar: * - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Code for /proc/lockdep and /proc/lockdep_stats: * @@ -498,6 +499,9 @@ static void *ls_start(struct seq_file *m, loff_t *pos) if (data->iter == data->stats) seq_header(m); + if (data->iter == data->iter_end) + data->iter = NULL; + return data->iter; } -- cgit v1.1 From 96645678cd726e87ce42a0664de71e047e32bca4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:49:00 -0700 Subject: lockstat: measure lock bouncing __acquire | lock _____ | \ | __contended | | | wait | _______/ |/ | __acquired | __release | unlock We measure acquisition and contention bouncing. This is done by recording a cpu stamp in each lock instance. Contention bouncing requires the cpu stamp to be set on acquisition. Hence we move __acquired into the generic path. __acquired is then used to measure acquisition bouncing by comparing the current cpu with the old stamp before replacing it. 
__contended is used to measure contention bouncing (only useful for preemptable locks) [akpm@linux-foundation.org: cleanups] Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 38 ++++++++++++++++++++++++++------------ kernel/lockdep_proc.c | 19 ++++++++++++------- kernel/mutex.c | 2 +- 3 files changed, 39 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index cb64022..156fce4 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -177,6 +177,9 @@ struct lock_class_stats lock_stats(struct lock_class *class) lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + + for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) + stats.bounces[i] += pcs->bounces[i]; } return stats; @@ -2325,6 +2328,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; lock->key = key; lock->class_cache = NULL; +#ifdef CONFIG_LOCK_STAT + lock->cpu = raw_smp_processor_id(); +#endif if (subclass) register_lock_class(lock, subclass, 1); } @@ -2775,6 +2781,8 @@ found_it: stats = get_lock_stats(hlock->class); if (point < ARRAY_SIZE(stats->contention_point)) stats->contention_point[i]++; + if (lock->cpu != smp_processor_id()) + stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); } @@ -2786,8 +2794,8 @@ __lock_acquired(struct lockdep_map *lock) struct lock_class_stats *stats; unsigned int depth; u64 now; - s64 waittime; - int i; + s64 waittime = 0; + int i, cpu; depth = curr->lockdep_depth; if (DEBUG_LOCKS_WARN_ON(!depth)) @@ -2809,19 +2817,25 @@ __lock_acquired(struct lockdep_map *lock) return; found_it: - if (!hlock->waittime_stamp) - return; - - now = sched_clock(); - waittime = now - hlock->waittime_stamp; - hlock->holdtime_stamp = now; + cpu = smp_processor_id(); + if (hlock->waittime_stamp) { + now = sched_clock(); + waittime = now - hlock->waittime_stamp; + hlock->holdtime_stamp = now; + } stats = get_lock_stats(hlock->class); - if (hlock->read) - lock_time_inc(&stats->read_waittime, waittime); - else - lock_time_inc(&stats->write_waittime, waittime); + if (waittime) { + if (hlock->read) + lock_time_inc(&stats->read_waittime, waittime); + else + lock_time_inc(&stats->write_waittime, waittime); + } + if (lock->cpu != cpu) + stats->bounces[bounce_acquired + !!hlock->read]++; put_lock_stats(stats); + + lock->cpu = cpu; } void lock_contended(struct lockdep_map *lock, unsigned long ip) diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 39163ed..7ff8013 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -430,16 +430,18 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) else seq_printf(m, "%40s:", name); + seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]); seq_lock_time(m, &stats->write_waittime); - seq_puts(m, " "); + seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]); seq_lock_time(m, &stats->write_holdtime); seq_puts(m, "\n"); } if (stats->read_holdtime.nr) { seq_printf(m, "%38s-R:", name); + seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]); seq_lock_time(m, &stats->read_waittime); - seq_puts(m, " "); + seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]); seq_lock_time(m, &stats->read_holdtime); seq_puts(m, "\n"); } @@ -469,26 +471,29 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) } if (i) { seq_puts(m, "\n"); - seq_line(m, '.', 0, 40 + 1 + 8 * 
(14 + 1)); + seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); seq_puts(m, "\n"); } } static void seq_header(struct seq_file *m) { - seq_printf(m, "lock_stat version 0.1\n"); - seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1)); - seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s\n", + seq_printf(m, "lock_stat version 0.2\n"); + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); + seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " + "%14s %14s\n", "class name", + "con-bounces", "contentions", "waittime-min", "waittime-max", "waittime-total", + "acq-bounces", "acquisitions", "holdtime-min", "holdtime-max", "holdtime-total"); - seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1)); + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); seq_printf(m, "\n"); } diff --git a/kernel/mutex.c b/kernel/mutex.c index 7a3f327..691b865 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -180,8 +180,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) spin_lock_mutex(&lock->wait_lock, flags); } - lock_acquired(&lock->dep_map); done: + lock_acquired(&lock->dep_map); /* got the lock - rejoice! */ mutex_remove_waiter(lock, &waiter, task_thread_info(task)); debug_mutex_set_owner(lock, task_thread_info(task)); -- cgit v1.1 From d38e1d5aaee384698fcef9455d6e2df1d062a1d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:49:01 -0700 Subject: lockstat: better class name representation optionally add class->name_version and class->subclass to the class name Signed-off-by: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep_proc.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 7ff8013..9f17af4 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -421,8 +421,30 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) class = data->class; stats = &data->stats; - snprintf(name, 38, "%s", class->name); + namelen = 38; + if (class->name_version > 1) + namelen -= 2; /* XXX truncates versions > 9 */ + if (class->subclass) + namelen -= 2; + + if (!class->name) { + char str[KSYM_NAME_LEN]; + const char *key_name; + + key_name = __get_key_name(class->key, str); + snprintf(name, namelen, "%s", key_name); + } else { + snprintf(name, namelen, "%s", class->name); + } namelen = strlen(name); + if (class->name_version > 1) { + snprintf(name+namelen, 3, "#%d", class->name_version); + namelen += 2; + } + if (class->subclass) { + snprintf(name+namelen, 3, "/%d", class->subclass); + namelen += 2; + } if (stats->write_holdtime.nr) { if (stats->read_holdtime.nr) -- cgit v1.1 From c71063c9c9dc232d0d51f936f237f7dc5681e8e3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 19 Jul 2007 01:49:02 -0700 Subject: lockdep debugging: give stacktrace for init_error When I started adding support for lockdep to 64-bit powerpc, I got a lockdep_init_error and with this patch was able to pinpoint why and where to put lockdep_init(). Let's support this generally for others adding lockdep support to their architecture. 
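
[Editorial note: for architecture porters, a hypothetical sketch of where such a call belongs. The function name and body below are made up purely for illustration; the only requirement is that lockdep_init() runs before the first lock of the boot path is acquired, otherwise the validator's hash tables are still uninitialized and the error this patch now backtraces is reported.

/*
 * Hypothetical early C entry point of an architecture port
 * (illustration only, not taken from any real port):
 */
void __init early_setup(void)
{
	lockdep_init();		/* set up lockdep hash tables first */

	/* ... early MMU/console bring-up, first spin_lock() users ... */
}
]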
Signed-off-by: Johannes Berg Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 156fce4..734da57 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -375,6 +375,11 @@ unsigned int max_recursion_depth; * about it later on, in lockdep_info(). */ static int lockdep_init_error; +static unsigned long lockdep_init_trace_data[20]; +static struct stack_trace lockdep_init_trace = { + .max_entries = ARRAY_SIZE(lockdep_init_trace_data), + .entries = lockdep_init_trace_data, +}; /* * Various lockdep statistics: @@ -662,6 +667,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (unlikely(!lockdep_initialized)) { lockdep_init(); lockdep_init_error = 1; + save_stack_trace(&lockdep_init_trace); } #endif @@ -3040,8 +3046,11 @@ void __init lockdep_info(void) sizeof(struct held_lock) * MAX_LOCK_DEPTH); #ifdef CONFIG_DEBUG_LOCKDEP - if (lockdep_init_error) - printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n"); + if (lockdep_init_error) { + printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); + printk("Call stack leading to lockdep invocation was:\n"); + print_stack_trace(&lockdep_init_trace, 0); + } #endif } -- cgit v1.1 From 71120f183bff04ba4f7ba3cc554202061912d548 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2007 01:49:16 -0700 Subject: timekeeping: fixup shadow variable argument clocksource_adjust() has a clock argument, which shadows the file global clock variable. Fix this up. Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 728cedf..8969877 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -401,7 +401,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. 
*/ -static void clocksource_adjust(struct clocksource *clock, s64 offset) +static void clocksource_adjust(s64 offset) { s64 error, interval = clock->cycle_interval; int adj; @@ -476,7 +476,7 @@ void update_wall_time(void) } /* correct the clock when NTP error is too big */ - clocksource_adjust(clock, offset); + clocksource_adjust(offset); /* store full nanoseconds into xtime */ xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; -- cgit v1.1 From 6819457d2cb7fe4fdb0fc3655b6b6dc71a86bee9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2007 01:49:16 -0700 Subject: timer.c: cleanup recently introduced whitespace damage Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index b7792fb..d1e8b97 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -103,14 +103,14 @@ static inline tvec_base_t *tbase_get_base(tvec_base_t *base) static inline void timer_set_deferrable(struct timer_list *timer) { timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | - TBASE_DEFERRABLE_FLAG)); + TBASE_DEFERRABLE_FLAG)); } static inline void timer_set_base(struct timer_list *timer, tvec_base_t *new_base) { timer->base = (tvec_base_t *)((unsigned long)(new_base) | - tbase_get_deferrable(timer->base)); + tbase_get_deferrable(timer->base)); } /** @@ -445,10 +445,10 @@ EXPORT_SYMBOL(__mod_timer); void add_timer_on(struct timer_list *timer, int cpu) { tvec_base_t *base = per_cpu(tvec_bases, cpu); - unsigned long flags; + unsigned long flags; timer_stats_timer_set_start_info(timer); - BUG_ON(timer_pending(timer) || !timer->function); + BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); internal_add_timer(base, timer); @@ -627,7 +627,7 @@ static inline void __run_timers(tvec_base_t *base) while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list; struct list_head *head = &work_list; - int index = base->timer_jiffies & TVR_MASK; + int index = base->timer_jiffies & TVR_MASK; /* * Cascade timers: @@ -644,8 +644,8 @@ static inline void __run_timers(tvec_base_t *base) unsigned long data; timer = list_first_entry(head, struct timer_list,entry); - fn = timer->function; - data = timer->data; + fn = timer->function; + data = timer->data; timer_stats_account_timer(timer); @@ -689,8 +689,8 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base) index = slot = timer_jiffies & TVR_MASK; do { list_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) - continue; + if (tbase_get_deferrable(nte->base)) + continue; found = 1; expires = nte->expires; @@ -834,7 +834,7 @@ void update_process_times(int user_tick) if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); scheduler_tick(); - run_posix_cpu_timers(p); + run_posix_cpu_timers(p); } /* @@ -909,7 +909,7 @@ static inline void update_times(unsigned long ticks) update_wall_time(); calc_load(ticks); } - + /* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. 
@@ -1105,7 +1105,7 @@ asmlinkage long sys_gettid(void) /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill - */ + */ int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; -- cgit v1.1 From 5992b6dac0d23a2b51a1ccbaf8f1a2e62097b12b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Jul 2007 01:49:21 -0700 Subject: lguest: export symbols for lguest as a module lguest does some fairly lowlevel things to support a host, which normal modules don't need: math_state_restore: When the guest triggers a Device Not Available fault, we need to be able to restore the FPU __put_task_struct: We need to hold a reference to another task for inter-guest I/O, and put_task_struct() is an inline function which calls __put_task_struct. access_process_vm: We need to access another task for inter-guest I/O. map_vm_area & __get_vm_area: We need to map the switcher shim (ie. monitor) at 0xFFC01000. Signed-off-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4698389..e7a2d99 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -127,6 +127,7 @@ void __put_task_struct(struct task_struct *tsk) if (!profile_handoff_task(tsk)) free_task(tsk); } +EXPORT_SYMBOL_GPL(__put_task_struct); void __init fork_init(unsigned long mempages) { -- cgit v1.1 From d7e28ffe6c74416b54345d6004fd0964c115b12c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Jul 2007 01:49:23 -0700 Subject: lguest: the host code This is the code for the "lg.ko" module, which allows lguest guests to be launched. [akpm@linux-foundation.org: update for futex-new-private-futexes] [akpm@linux-foundation.org: build fix] [jmorris@namei.org: lguest: use hrtimers] [akpm@linux-foundation.org: x86_64 build fix] Signed-off-by: Rusty Russell Cc: Andi Kleen Cc: Eric Dumazet Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e7a2d99..4698389 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -127,7 +127,6 @@ void __put_task_struct(struct task_struct *tsk) if (!profile_handoff_task(tsk)) free_task(tsk); } -EXPORT_SYMBOL_GPL(__put_task_struct); void __init fork_init(unsigned long mempages) { -- cgit v1.1 From ed2c12f323e8fafbc94f9bcfb924f9df36e64dc7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Jul 2007 01:50:35 -0700 Subject: kernel/sysctl.c: finish off the warning comments I've been chasing these comments around this file all week. Hopefully we're straight now. 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e69179b..2222998 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -748,7 +748,10 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, - +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ { .ctl_name = 0 } }; -- cgit v1.1 From 9439aab8dbc33c2c03c3a19dba267360383ba38c Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 19 Jul 2007 21:28:35 +0200 Subject: [PATCH] sched: fix newly idle load balance in case of SMT In the presence of SMT, newly idle balance was never happening for multi-core and SMP domains (even when both the logical siblings are idle). If thread 0 is already idle and when thread 1 is about to go to idle, newly idle load balance always think that one of the threads is not idle and skips doing the newly idle load balance for multi-core and SMP domains. This is because of the idle_cpu() macro, which checks if the current process on a cpu is an idle process. But this is not the case for the thread doing the load_balance_newidle(). Fix this by using runqueue's nr_running field instead of idle_cpu(). And also skip the logic of 'only one idle cpu in the group will be doing load balancing' during newly idle case. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 645256b..e36d99d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2235,7 +2235,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, rq = cpu_rq(i); - if (*sd_idle && !idle_cpu(i)) + if (*sd_idle && rq->nr_running) *sd_idle = 0; /* Bias balancing toward cpus of our domain */ @@ -2257,9 +2257,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* * First idle cpu or the first cpu(busiest) in this sched group * is eligible for doing load balancing at this and above - * domains. + * domains. In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. */ - if (local_group && balance_cpu != this_cpu && balance) { + if (idle != CPU_NEWLY_IDLE && local_group && + balance_cpu != this_cpu && balance) { *balance = 0; goto ret; } -- cgit v1.1 From 969bb4e4032dac67287951d8f6642a3b5119694e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 19 Jul 2007 21:28:35 +0200 Subject: [PATCH] sched: fix the all pinned logic in load_balance_newidle() nr_moved is not the correct check for triggering all pinned logic. Fix the all pinned logic in the case of load_balance_newidle(). 
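The distinction matters because a balancing pass can move nothing for two different reasons: every task on the chosen busiest CPU was pinned there (worth excluding that CPU and retrying against a smaller mask), or there was simply nothing profitable to pull. A small standalone sketch of keying the retry on an all-pinned flag rather than on the number of tasks moved; pull_tasks() and the choice of which CPUs count as "pinned" are invented purely for illustration:

#include <stdio.h>

#define NCPUS 4

/* stand-in for move_tasks(): pretend CPUs 2 and 3 only run pinned tasks */
static int pull_tasks(int busiest, int *all_pinned)
{
    *all_pinned = (busiest >= 2);
    return *all_pinned ? 0 : 1;         /* tasks actually moved */
}

/* pick the highest-numbered CPU still in the candidate mask */
static int pick_busiest(unsigned int mask)
{
    for (int cpu = NCPUS - 1; cpu >= 0; cpu--)
        if (mask & (1u << cpu))
            return cpu;
    return -1;
}

int main(void)
{
    unsigned int cpus = (1u << NCPUS) - 1;      /* CPU_MASK_ALL analogue */
    int moved = 0, all_pinned = 0;

    while (!moved) {
        int busiest = pick_busiest(cpus);

        if (busiest < 0)
            break;
        moved = pull_tasks(busiest, &all_pinned);
        if (!moved && all_pinned) {
            /* retry only because everything on that CPU was pinned */
            cpus &= ~(1u << busiest);
            printf("cpu %d fully pinned, retrying without it\n", busiest);
        } else if (!moved) {
            break;      /* nothing profitable to move, give up */
        }
    }
    printf("moved=%d\n", moved);
    return 0;
}
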
Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e36d99d..a35a92f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2679,6 +2679,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) unsigned long imbalance; int nr_moved = 0; int sd_idle = 0; + int all_pinned = 0; cpumask_t cpus = CPU_MASK_ALL; /* @@ -2717,10 +2718,11 @@ redo: double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, minus_1_or_zero(busiest->nr_running), - imbalance, sd, CPU_NEWLY_IDLE, NULL); + imbalance, sd, CPU_NEWLY_IDLE, + &all_pinned); spin_unlock(&busiest->lock); - if (!nr_moved) { + if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), cpus); if (!cpus_empty(cpus)) goto redo; -- cgit v1.1 From e436d80085133858bf2613a630365e8a0459fd58 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 Jul 2007 21:28:35 +0200 Subject: [PATCH] sched: implement cpu_clock(cpu) high-speed time source Implement the cpu_clock(cpu) interface for kernel-internal use: high-speed (but slightly incorrect) per-cpu clock constructed from sched_clock(). This API, unused at the moment, will be used in the future by blktrace, by the softlockup-watchdog, by printk and by lockstat. Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a35a92f..93cf241 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -379,6 +379,23 @@ static inline unsigned long long rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +/* + * For kernel-internal use: high-speed (but slightly incorrect) per-cpu + * clock constructed from sched_clock(): + */ +unsigned long long cpu_clock(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long long now; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + now = rq_clock(rq); + spin_unlock_irqrestore(&rq->lock, flags); + + return now; +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* Change a task's ->cfs_rq if it moves across CPUs */ static inline void set_task_cfs_rq(struct task_struct *p) -- cgit v1.1 From 20c2df83d25c6a95affe6157a4c9cac4cf5ffaac Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 20 Jul 2007 10:11:58 +0900 Subject: mm: Remove slab destructors from kmem_cache_create(). Slab destructors were no longer supported after Christoph's c59def9f222d44bb7e2f0a559f2906191a0862d7 change. They've been BUGs for both slab and slub, and slob never supported them either. This rips out support for the dtor pointer from kmem_cache_create() completely and fixes up every single callsite in the kernel (there were about 224, not including the slab allocator definitions themselves, or the documentation references). 
Signed-off-by: Paul Mundt --- kernel/fork.c | 18 +++++++++--------- kernel/nsproxy.c | 2 +- kernel/posix-timers.c | 2 +- kernel/user.c | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4698389..7332e23 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -137,7 +137,7 @@ void __init fork_init(unsigned long mempages) /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", sizeof(struct task_struct), - ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); + ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); #endif /* @@ -1446,22 +1446,22 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, - sighand_ctor, NULL); + sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); - files_cachep = kmem_cache_create("files_cache", + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); - fs_cachep = kmem_cache_create("fs_cache", + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL, NULL); + SLAB_PANIC, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 10f0bbb..a4fb7d4 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -193,7 +193,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, static int __init nsproxy_cache_init(void) { nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); return 0; } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 329ce01..55b3761 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -241,7 +241,7 @@ static __init int init_posix_timers(void) register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, 0, NULL, NULL); + sizeof (struct k_itimer), 0, 0, NULL); idr_init(&posix_timers_id); return 0; } diff --git a/kernel/user.c b/kernel/user.c index 98b8250..e7d11ce 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -208,7 +208,7 @@ static int __init uid_cache_init(void) int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_LIST_HEAD(init_user_ns.uidhash_table + n); -- cgit v1.1 From 1f564ad6d4182859612cbae452122e5eb2d62a76 Mon Sep 17 00:00:00 2001 From: Bob Picco Date: Wed, 18 Jul 2007 15:51:28 -0700 Subject: [IA64] remove time interpolator Remove time_interpolator code (This is generic code, but only user was ia64. It has been superseded by the CONFIG_GENERIC_TIME code). 
Signed-off-by: Bob Picco Signed-off-by: John Stultz Signed-off-by: Peter Keilty Signed-off-by: Tony Luck --- kernel/time.c | 88 ---------------------- kernel/time/ntp.c | 10 --- kernel/time/timekeeping.c | 4 - kernel/timer.c | 188 ---------------------------------------------- 4 files changed, 290 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index ffe1914..e325597 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -136,7 +136,6 @@ static inline void warp_clock(void) write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; - time_interpolator_reset(); write_sequnlock_irq(&xtime_lock); clock_was_set(); } @@ -309,92 +308,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) } EXPORT_SYMBOL(timespec_trunc); -#ifdef CONFIG_TIME_INTERPOLATION -void getnstimeofday (struct timespec *tv) -{ - unsigned long seq,sec,nsec; - - do { - seq = read_seqbegin(&xtime_lock); - sec = xtime.tv_sec; - nsec = xtime.tv_nsec+time_interpolator_get_offset(); - } while (unlikely(read_seqretry(&xtime_lock, seq))); - - while (unlikely(nsec >= NSEC_PER_SEC)) { - nsec -= NSEC_PER_SEC; - ++sec; - } - tv->tv_sec = sec; - tv->tv_nsec = nsec; -} -EXPORT_SYMBOL_GPL(getnstimeofday); - -int do_settimeofday (struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - { - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; - time_interpolator_reset(); - } - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} -EXPORT_SYMBOL(do_settimeofday); - -void do_gettimeofday (struct timeval *tv) -{ - unsigned long seq, nsec, usec, sec, offset; - do { - seq = read_seqbegin(&xtime_lock); - offset = time_interpolator_get_offset(); - sec = xtime.tv_sec; - nsec = xtime.tv_nsec; - } while (unlikely(read_seqretry(&xtime_lock, seq))); - - usec = (nsec + offset) / 1000; - - while (unlikely(usec >= USEC_PER_SEC)) { - usec -= USEC_PER_SEC; - ++sec; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; - - /* - * Make sure xtime.tv_sec [returned by sys_time()] always - * follows the gettimeofday() result precisely. This - * condition is extremely unlikely, it can hit at most - * once per second: - */ - if (unlikely(xtime.tv_sec != tv->tv_sec)) { - unsigned long flags; - - write_seqlock_irqsave(&xtime_lock, flags); - update_wall_time(); - write_sequnlock_irqrestore(&xtime_lock, flags); - } -} -EXPORT_SYMBOL(do_gettimeofday); - -#else /* CONFIG_TIME_INTERPOLATION */ - #ifndef CONFIG_GENERIC_TIME /* * Simulate gettimeofday using do_gettimeofday which only allows a timeval @@ -410,7 +323,6 @@ void getnstimeofday(struct timespec *tv) } EXPORT_SYMBOL_GPL(getnstimeofday); #endif -#endif /* CONFIG_TIME_INTERPOLATION */ /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 
1980-12-31 23:59:59 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 438c6b7..b5e3525 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -116,11 +116,6 @@ void second_overflow(void) if (xtime.tv_sec % 86400 == 0) { xtime.tv_sec--; wall_to_monotonic.tv_sec++; - /* - * The timer interpolator will make time change - * gradually instead of an immediate jump by one second - */ - time_interpolator_update(-NSEC_PER_SEC); time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second " "23:59:60 UTC\n"); @@ -130,11 +125,6 @@ void second_overflow(void) if ((xtime.tv_sec + 1) % 86400 == 0) { xtime.tv_sec++; wall_to_monotonic.tv_sec--; - /* - * Use of time interpolator for a gradual change of - * time - */ - time_interpolator_update(NSEC_PER_SEC); time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second " "23:59:59 UTC\n"); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 728cedf..027d46c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -466,10 +466,6 @@ void update_wall_time(void) second_overflow(); } - /* interpolator bits */ - time_interpolator_update(clock->xtime_interval - >> clock->shift); - /* accumulate error between NTP and clock interval */ clock->error += current_tick_length(); clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); diff --git a/kernel/timer.c b/kernel/timer.c index b7792fb..dbc03ab 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1349,194 +1349,6 @@ void __init init_timers(void) open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); } -#ifdef CONFIG_TIME_INTERPOLATION - -struct time_interpolator *time_interpolator __read_mostly; -static struct time_interpolator *time_interpolator_list __read_mostly; -static DEFINE_SPINLOCK(time_interpolator_lock); - -static inline cycles_t time_interpolator_get_cycles(unsigned int src) -{ - unsigned long (*x)(void); - - switch (src) - { - case TIME_SOURCE_FUNCTION: - x = time_interpolator->addr; - return x(); - - case TIME_SOURCE_MMIO64 : - return readq_relaxed((void __iomem *)time_interpolator->addr); - - case TIME_SOURCE_MMIO32 : - return readl_relaxed((void __iomem *)time_interpolator->addr); - - default: return get_cycles(); - } -} - -static inline u64 time_interpolator_get_counter(int writelock) -{ - unsigned int src = time_interpolator->source; - - if (time_interpolator->jitter) - { - cycles_t lcycle; - cycles_t now; - - do { - lcycle = time_interpolator->last_cycle; - now = time_interpolator_get_cycles(src); - if (lcycle && time_after(lcycle, now)) - return lcycle; - - /* When holding the xtime write lock, there's no need - * to add the overhead of the cmpxchg. Readers are - * force to retry until the write lock is released. - */ - if (writelock) { - time_interpolator->last_cycle = now; - return now; - } - /* Keep track of the last timer value returned. The use of cmpxchg here - * will cause contention in an SMP environment. 
- */ - } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle)); - return now; - } - else - return time_interpolator_get_cycles(src); -} - -void time_interpolator_reset(void) -{ - time_interpolator->offset = 0; - time_interpolator->last_counter = time_interpolator_get_counter(1); -} - -#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) - -unsigned long time_interpolator_get_offset(void) -{ - /* If we do not have a time interpolator set up then just return zero */ - if (!time_interpolator) - return 0; - - return time_interpolator->offset + - GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); -} - -#define INTERPOLATOR_ADJUST 65536 -#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST - -void time_interpolator_update(long delta_nsec) -{ - u64 counter; - unsigned long offset; - - /* If there is no time interpolator set up then do nothing */ - if (!time_interpolator) - return; - - /* - * The interpolator compensates for late ticks by accumulating the late - * time in time_interpolator->offset. A tick earlier than expected will - * lead to a reset of the offset and a corresponding jump of the clock - * forward. Again this only works if the interpolator clock is running - * slightly slower than the regular clock and the tuning logic insures - * that. - */ - - counter = time_interpolator_get_counter(1); - offset = time_interpolator->offset + - GET_TI_NSECS(counter, time_interpolator); - - if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) - time_interpolator->offset = offset - delta_nsec; - else { - time_interpolator->skips++; - time_interpolator->ns_skipped += delta_nsec - offset; - time_interpolator->offset = 0; - } - time_interpolator->last_counter = counter; - - /* Tuning logic for time interpolator invoked every minute or so. - * Decrease interpolator clock speed if no skips occurred and an offset is carried. - * Increase interpolator clock speed if we skip too much time. 
- */ - if (jiffies % INTERPOLATOR_ADJUST == 0) - { - if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec) - time_interpolator->nsec_per_cyc--; - if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) - time_interpolator->nsec_per_cyc++; - time_interpolator->skips = 0; - time_interpolator->ns_skipped = 0; - } -} - -static inline int -is_better_time_interpolator(struct time_interpolator *new) -{ - if (!time_interpolator) - return 1; - return new->frequency > 2*time_interpolator->frequency || - (unsigned long)new->drift < (unsigned long)time_interpolator->drift; -} - -void -register_time_interpolator(struct time_interpolator *ti) -{ - unsigned long flags; - - /* Sanity check */ - BUG_ON(ti->frequency == 0 || ti->mask == 0); - - ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; - spin_lock(&time_interpolator_lock); - write_seqlock_irqsave(&xtime_lock, flags); - if (is_better_time_interpolator(ti)) { - time_interpolator = ti; - time_interpolator_reset(); - } - write_sequnlock_irqrestore(&xtime_lock, flags); - - ti->next = time_interpolator_list; - time_interpolator_list = ti; - spin_unlock(&time_interpolator_lock); -} - -void -unregister_time_interpolator(struct time_interpolator *ti) -{ - struct time_interpolator *curr, **prev; - unsigned long flags; - - spin_lock(&time_interpolator_lock); - prev = &time_interpolator_list; - for (curr = *prev; curr; curr = curr->next) { - if (curr == ti) { - *prev = curr->next; - break; - } - prev = &curr->next; - } - - write_seqlock_irqsave(&xtime_lock, flags); - if (ti == time_interpolator) { - /* we lost the best time-interpolator: */ - time_interpolator = NULL; - /* find the next-best interpolator */ - for (curr = time_interpolator_list; curr; curr = curr->next) - if (is_better_time_interpolator(curr)) - time_interpolator = curr; - time_interpolator_reset(); - } - write_sequnlock_irqrestore(&xtime_lock, flags); - spin_unlock(&time_interpolator_lock); -} -#endif /* CONFIG_TIME_INTERPOLATION */ - /** * msleep - sleep safely even with waitqueue interruptions * @msecs: Time in milliseconds to sleep for -- cgit v1.1 From 0b1937ac0ef1541c0ea44e6f81c33d2f59803957 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Jul 2007 17:02:04 +0100 Subject: FRV: Fix linkage problems Make it possible to use __start_notes and __stop_notes without getting a GPREL overflow error from the FRV linker. Small variables that would otherwise be in .data or .bss may, depending on the arch, be placed in special sections (.sdata or .sbss) that permit single instruction references on fixed instruction width machines. __start_notes and __stop_notes aren't really char variables, and certainly don't refer to data in .data or .bss. Making them type "void" fools the compiler into not assuming anything about them. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 2565e1b..d0e5c48 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -65,8 +65,8 @@ KERNEL_ATTR_RO(kexec_crash_loaded); /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
*/ -extern const char __start_notes __attribute__((weak)); -extern const char __stop_notes __attribute__((weak)); +extern const void __start_notes __attribute__((weak)); +extern const void __stop_notes __attribute__((weak)); #define notes_size (&__stop_notes - &__start_notes) static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, -- cgit v1.1 From 2008220879af095d00ca27eb168a55c8595fbc0b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 20 Jul 2007 13:28:54 -0700 Subject: Revert "sys_time() speedup" This basically reverts commit 4e44f3497d41db4c3b9051c61410dee8ae4fb49c, while waiting for it to be re-done more completely. There are cases of people mixing "time()" with higher-resolution time sources, and we need to take the nanosecond offsets into account. Ingo has a patch that does that, but it's still under some discussion. In the meantime, just revert back to the old simple situation of just doing the whole exact timesource calculations. But rather than using do_gettimeofday(), use the internal nanosecond resolution getnstimeofday(), which at least avoids one unnecessary conversion (since we really don't care about whether the fractional seconds are nanoseconds or microseconds - we'll just throw them away). Signed-off-by: Linus Torvalds --- kernel/time.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index e325597..5b81da0 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -57,17 +57,14 @@ EXPORT_SYMBOL(sys_tz); */ asmlinkage long sys_time(time_t __user * tloc) { - /* - * We read xtime.tv_sec atomically - it's updated - * atomically by update_wall_time(), so no need to - * even read-lock the xtime seqlock: - */ - time_t i = xtime.tv_sec; + time_t i; + struct timespec tv; - smp_rmb(); /* sys_time() results are coherent */ + getnstimeofday(&tv); + i = tv.tv_sec; if (tloc) { - if (put_user(i, tloc)) + if (put_user(i,tloc)) i = -EFAULT; } return i; -- cgit v1.1 From 18de5bc4c1f1f1fa5e14f354a7603bd6e9d4e3b6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 21 Jul 2007 04:37:34 -0700 Subject: clockevents: fix resume logic We need to make sure, that the clockevent devices are resumed, before the tick is resumed. The current resume logic does not guarantee this. Add CLOCK_EVT_MODE_RESUME and call the set mode functions of the clock event devices before resuming the tick / oneshot functionality. Fixup the existing users. Thanks to Nigel Cunningham for tracking down a long standing thinko, which affected the jinxed VAIO. 
[akpm@linux-foundation.org: xen build fix] Signed-off-by: Thomas Gleixner Cc: john stultz Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/tick-broadcast.c | 6 ++++-- kernel/time/tick-common.c | 16 ++++++++++------ 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 8001d37..8339af2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -49,7 +49,7 @@ cpumask_t *tick_get_broadcast_mask(void) */ static void tick_broadcast_start_periodic(struct clock_event_device *bc) { - if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (bc) tick_setup_periodic(bc, 1); } @@ -299,7 +299,7 @@ void tick_suspend_broadcast(void) spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - if (bc && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + if (bc) clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -316,6 +316,8 @@ int tick_resume_broadcast(void) bc = tick_broadcast_device.evtdev; if (bc) { + clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); + switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: if(!cpus_empty(tick_broadcast_mask)) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index a96ec9ab..77a21ab 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -318,12 +318,17 @@ static void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); unsigned long flags; + int broadcast = tick_resume_broadcast(); spin_lock_irqsave(&tick_device_lock, flags); - if (td->mode == TICKDEV_MODE_PERIODIC) - tick_setup_periodic(td->evtdev, 0); - else - tick_resume_oneshot(); + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); + + if (!broadcast) { + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(td->evtdev, 0); + else + tick_resume_oneshot(); + } spin_unlock_irqrestore(&tick_device_lock, flags); } @@ -360,8 +365,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, break; case CLOCK_EVT_NOTIFY_RESUME: - if (!tick_resume_broadcast()) - tick_resume(); + tick_resume(); break; default: -- cgit v1.1 From 5590a536c0bc403fc73908c66c1c88cbed735ecb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 21 Jul 2007 04:37:35 -0700 Subject: clockevents: fix device replacement When a device is replaced by a better rated device, then the broadcast mode needs to be evaluated again. When the new device has no requirement for broadcasting, then the broadcast bits for the CPU must be cleared. 
Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/tick-broadcast.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 8339af2..db8e0f3 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -31,6 +31,12 @@ struct tick_device tick_broadcast_device; static cpumask_t tick_broadcast_mask; static DEFINE_SPINLOCK(tick_broadcast_lock); +#ifdef CONFIG_TICK_ONESHOT +static void tick_broadcast_clear_oneshot(int cpu); +#else +static inline void tick_broadcast_clear_oneshot(int cpu) { } +#endif + /* * Debugging: see timer_list.c */ @@ -99,8 +105,19 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) cpu_set(cpu, tick_broadcast_mask); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; - } + } else { + /* + * When the new device is not affected by the stop + * feature and the cpu is marked in the broadcast mask + * then clear the broadcast bit. + */ + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { + int cpu = smp_processor_id(); + cpu_clear(cpu, tick_broadcast_mask); + tick_broadcast_clear_oneshot(cpu); + } + } spin_unlock_irqrestore(&tick_broadcast_lock, flags); return ret; } @@ -487,6 +504,16 @@ out: spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +/* + * Reset the one shot broadcast for a cpu + * + * Called with tick_broadcast_lock held + */ +static void tick_broadcast_clear_oneshot(int cpu) +{ + cpu_clear(cpu, tick_broadcast_oneshot_mask); +} + /** * tick_broadcast_setup_highres - setup the broadcast device for highres */ -- cgit v1.1 From 3704540b48295253bd9c87a5e7ff545f9d47a3b8 Mon Sep 17 00:00:00 2001 From: john stultz Date: Sat, 21 Jul 2007 04:37:35 -0700 Subject: tick management: spread timer interrupt After discussing w/ Thomas over IRC, it seems the issue is the sched tick fires on every cpu at the same time, causing extra lock contention. This smaller change, adds an extra offset per cpu so the ticks don't line up. This patch also drops the idle latency from 40us down to under 20us. 
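The staggering itself is plain integer arithmetic: half of one tick period is divided evenly across NR_CPUS and each CPU delays its first sched-tick hrtimer by its own share. A standalone illustration of the same calculation (HZ and NR_CPUS below are example values, not taken from any particular config):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL
#define HZ           250            /* example: 4 ms tick period */
#define NR_CPUS      8              /* example */

int main(void)
{
    uint64_t tick_period = NSEC_PER_SEC / HZ;
    uint64_t step = (tick_period >> 1) / NR_CPUS;   /* same math as the patch */

    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        printf("cpu %d: first tick pushed out by %llu ns\n",
               cpu, (unsigned long long)(step * cpu));
    return 0;
}
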
Signed-off-by: john stultz Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/tick-sched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 52db9e3..b416995 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -546,6 +546,7 @@ void tick_setup_sched_timer(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now = ktime_get(); + u64 offset; /* * Emulate tick processing via per-CPU hrtimers: @@ -554,8 +555,12 @@ void tick_setup_sched_timer(void) ts->sched_timer.function = tick_sched_timer; ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; - /* Get the next period */ + /* Get the next period (per cpu) */ ts->sched_timer.expires = tick_init_jiffy_update(); + offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, NR_CPUS); + offset *= smp_processor_id(); + ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); -- cgit v1.1 From 820de5c39ef7f6866d2c9e6c7d208bcd2a6e1942 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 21 Jul 2007 04:37:36 -0700 Subject: highres: improve debug output Add some more debug information to the hrtimer and clock events code. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 5 ++++- kernel/time/tick-oneshot.c | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 72d0342..065a897 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -558,7 +558,8 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, */ static int hrtimer_switch_to_hres(void) { - struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + int cpu = smp_processor_id(); + struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; if (base->hres_active) @@ -568,6 +569,8 @@ static int hrtimer_switch_to_hres(void) if (tick_init_highres()) { local_irq_restore(flags); + printk(KERN_WARNING "Could not switch to high resolution " + "mode on CPU %d\n", cpu); return 0; } base->hres_active = 1; diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index f6997ab..0258d31 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -73,8 +73,21 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) struct clock_event_device *dev = td->evtdev; if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || - !tick_device_is_functional(dev)) + !tick_device_is_functional(dev)) { + + printk(KERN_INFO "Clockevents: " + "could not switch to one-shot mode:"); + if (!dev) { + printk(" no tick device\n"); + } else { + if (!tick_device_is_functional(dev)) + printk(" %s is not functional.\n", dev->name); + else + printk(" %s does not support one-shot mode.\n", + dev->name); + } return -EINVAL; + } td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; -- cgit v1.1 From 99bc2fcb283852931fb6bbef40f3df8316b59000 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 21 Jul 2007 04:37:36 -0700 Subject: hrtimer: speedup hrtimer_enqueue Speedup hrtimer_enqueue by evaluating the rbtree insertion result. 
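The trick is to note, while walking down to the insertion point, whether the walk ever turned right; if it never did, the new timer is the leftmost (earliest-expiring) node, so the old compare against the cached base->first can go away. A standalone sketch of that pattern; it uses a plain binary search tree rather than the kernel's rbtree, which is enough to show the idea:

#include <stdio.h>
#include <stdlib.h>

struct node {
    long key;                       /* stands in for the expiry time */
    struct node *left, *right;
};

/* insert key, return 1 if it became the leftmost (smallest) entry */
static int insert(struct node **root, long key)
{
    struct node **link = root;
    struct node *n;
    int leftmost = 1;

    while (*link) {
        if (key < (*link)->key) {
            link = &(*link)->left;
        } else {
            link = &(*link)->right;
            leftmost = 0;           /* went right at least once */
        }
    }

    n = calloc(1, sizeof(*n));
    if (!n)
        exit(1);
    n->key = key;
    *link = n;
    return leftmost;
}

int main(void)
{
    struct node *root = NULL;
    long keys[] = { 50, 20, 70, 10 };

    for (unsigned int i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
        printf("insert %ld -> new leftmost? %s\n",
               keys[i], insert(&root, keys[i]) ? "yes" : "no");
    return 0;
}
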
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 065a897..eb1ddeb 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -686,6 +686,7 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; struct hrtimer *entry; + int leftmost = 1; /* * Find the right place in the rbtree: @@ -697,18 +698,19 @@ static void enqueue_hrtimer(struct hrtimer *timer, * We dont care about collisions. Nodes with * the same expiry time stay together. */ - if (timer->expires.tv64 < entry->expires.tv64) + if (timer->expires.tv64 < entry->expires.tv64) { link = &(*link)->rb_left; - else + } else { link = &(*link)->rb_right; + leftmost = 0; + } } /* * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - if (!base->first || timer->expires.tv64 < - rb_entry(base->first, struct hrtimer, node)->expires.tv64) { + if (leftmost) { /* * Reprogram the clock event device. When the timer is already * expired hrtimer_enqueue_reprogram has either called the -- cgit v1.1 From 82644459c592a28a3eab682f9b88d81019ddfe8b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 21 Jul 2007 04:37:37 -0700 Subject: NTP: move the cmos update code into ntp.c i386 and sparc64 have the identical code to update the cmos clock. Move it into kernel/time/ntp.c as there are other architectures coming along with the same requirements. [akpm@linux-foundation.org: build fixes] Signed-off-by: Thomas Gleixner Cc: Chris Wright Cc: Ingo Molnar Cc: john stultz Cc: David Miller Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b5e3525..cd91237 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -175,12 +176,64 @@ u64 current_tick_length(void) return tick_length; } +#ifdef CONFIG_GENERIC_CMOS_UPDATE -void __attribute__ ((weak)) notify_arch_cmos_timer(void) +/* Disable the cmos update - used by virtualization and embedded */ +int no_sync_cmos_clock __read_mostly; + +static void sync_cmos_clock(unsigned long dummy); + +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); + +static void sync_cmos_clock(unsigned long dummy) +{ + struct timespec now, next; + int fail = 1; + + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be + * called as close as possible to 500 ms before the new second starts. + * This code is run on a timer. If the clock is set, that timer + * may not expire at the correct time. Thus, we adjust... + */ + if (!ntp_synced()) + /* + * Not synced, exit, do not restart a timer (if one is + * running, let it run out). 
+ */ + return; + + getnstimeofday(&now); + if (abs(xtime.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) + fail = update_persistent_clock(now); + + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; + if (next.tv_nsec <= 0) + next.tv_nsec += NSEC_PER_SEC; + + if (!fail) + next.tv_sec = 659; + else + next.tv_sec = 0; + + if (next.tv_nsec >= NSEC_PER_SEC) { + next.tv_sec++; + next.tv_nsec -= NSEC_PER_SEC; + } + mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); +} + +static void notify_cmos_timer(void) { - return; + if (no_sync_cmos_clock) + mod_timer(&sync_cmos_timer, jiffies + 1); } +#else +static inline void notify_cmos_timer(void) { } +#endif + /* adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ @@ -345,6 +398,6 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) txc->stbcnt = 0; write_sequnlock_irq(&xtime_lock); do_gettimeofday(&txc->time); - notify_arch_cmos_timer(); + notify_cmos_timer(); return(result); } -- cgit v1.1 From 42ee2b74140b69fa24da1c671b03c9f8019e6f62 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 21 Jul 2007 17:09:54 +0200 Subject: x86_64: Report the pending irq if available in smp_affinity Otherwise smp_affinity would only update after the next interrupt on x86 systems. Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/irq/proc.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index b4f1674..50b81b9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,7 +19,15 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity); + struct irq_desc *desc = irq_desc + (long)data; + cpumask_t *mask = &desc->affinity; + int len; + +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (desc->status & IRQ_MOVE_PENDING) + mask = &desc->pending_mask; +#endif + len = cpumask_scnprintf(page, count, *mask); if (count - len < 2) return -EINVAL; -- cgit v1.1 From 44bf4cea43816d43deab73c1c16361e899996eaa Mon Sep 17 00:00:00 2001 From: Nigel Cunningham Date: Sat, 21 Jul 2007 17:10:41 +0200 Subject: x86: PM_TRACE support Signed-off-by: Nigel Cunningham Cc: Randy Dunlap Cc: "Rafael J. Wysocki" Cc: Pavel Machek Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 7358609..c1a106d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -57,7 +57,7 @@ config DISABLE_CONSOLE_SUSPEND config PM_TRACE bool "Suspend/resume event tracing" - depends on PM_DEBUG && X86_32 && EXPERIMENTAL + depends on PM_DEBUG && X86 && EXPERIMENTAL default n ---help--- This enables some cheesy code to save the last PM event point in the -- cgit v1.1 From 5b9a4262232d632c28990fcdf4f36d0e0ade5f18 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Tue, 29 May 2007 10:38:18 -0400 Subject: [PATCH] Make IPC mode consistent The mode fields for IPC records are not consistent. Some are hex, others are octal. This patch makes them all octal. 
Signed-off-by: Steve Grubb Signed-off-by: Al Viro --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 145cbb7..f5e917e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -995,7 +995,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - "ouid=%u ogid=%u mode=%x", + "ouid=%u ogid=%u mode=%#o", axi->uid, axi->gid, axi->mode); if (axi->osid != 0) { char *ctx = NULL; @@ -1014,7 +1014,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_IPC_SET_PERM: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - "qbytes=%lx ouid=%u ogid=%u mode=%x", + "qbytes=%lx ouid=%u ogid=%u mode=%#o", axi->qbytes, axi->uid, axi->gid, axi->mode); break; } -- cgit v1.1 From c926e4f432af0f61ac2b9b637fb51a4871a3fc91 Mon Sep 17 00:00:00 2001 From: Klaus Weidner Date: Wed, 16 May 2007 17:45:42 -0500 Subject: [PATCH] audit: fix broken class-based syscall audit The sanity check in audit_match_class() is wrong. We are able to audit 2048 syscalls but in audit_match_class() we were accidentally using sizeof(_u32) instead of number of bits in _u32 when deciding how many syscalls were valid. On ia64 in particular we were hitting syscall numbers over the (wrong) limit of 256. Fixing the audit_match_class check takes care of the problem. Signed-off-by: Klaus Weidner Signed-off-by: Al Viro --- kernel/auditfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 1bf093d..0ea96ba 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -304,7 +304,7 @@ int __init audit_register_class(int class, unsigned *list) int audit_match_class(int class, unsigned syscall) { - if (unlikely(syscall >= AUDIT_BITMASK_SIZE * sizeof(__u32))) + if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32)) return 0; if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class])) return 0; -- cgit v1.1 From 74f2345b6be1410f824cb7dd638d2c10a9709379 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 4 Jun 2007 17:00:14 -0400 Subject: [PATCH] allow audit filtering on bit & operations Right now the audit filter can match on = != > < >= blah blah blah. 
This allow the filter to also look at bitwise AND operations, & Signed-off-by: Eric Paris Signed-off-by: Al Viro --- kernel/auditfilter.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0ea96ba..359645c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -456,6 +456,13 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_DEVMINOR: case AUDIT_EXIT: case AUDIT_SUCCESS: + /* bit ops are only useful on syscall args */ + if (f->op == AUDIT_BIT_MASK || + f->op == AUDIT_BIT_TEST) { + err = -EINVAL; + goto exit_free; + } + break; case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: @@ -1566,6 +1573,10 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) return (left > right); case AUDIT_GREATER_THAN_OR_EQUAL: return (left >= right); + case AUDIT_BIT_MASK: + return (left & right); + case AUDIT_BIT_TEST: + return ((left & right) == right); } BUG(); return 0; -- cgit v1.1 From 4259fa01a2d2aa3e589b34ba7624080232d9c1ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Jun 2007 11:13:31 -0400 Subject: [PATCH] get rid of AVC_PATH postponed treatment Selinux folks had been complaining about the lack of AVC_PATH records when audit is disabled. I must admit my stupidity - I assumed that avc_audit() really couldn't use audit_log_d_path() because of deadlocks (== could be called with dcache_lock or vfsmount_lock held). Shouldn't have made that assumption - it never gets called that way. It _is_ called under spinlocks, but not those. Since audit_log_d_path() uses ab->gfp_mask for allocations, kmalloc() in there is not a problem. IOW, the simple fix is sufficient: let's rip AUDIT_AVC_PATH out and simply generate pathname as part of main record. It's trivial to do. Signed-off-by: Al Viro Acked-by: James Morris --- kernel/auditsc.c | 47 ----------------------------------------------- 1 file changed, 47 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f5e917e..bde1124 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -173,12 +173,6 @@ struct audit_aux_data_fd_pair { int fd[2]; }; -struct audit_aux_data_path { - struct audit_aux_data d; - struct dentry *dentry; - struct vfsmount *mnt; -}; - struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; @@ -654,12 +648,6 @@ static inline void audit_free_aux(struct audit_context *context) struct audit_aux_data *aux; while ((aux = context->aux)) { - if (aux->type == AUDIT_AVC_PATH) { - struct audit_aux_data_path *axi = (void *)aux; - dput(axi->dentry); - mntput(axi->mnt); - } - context->aux = aux->next; kfree(aux); } @@ -1038,11 +1026,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_hex(ab, axs->a, axs->len); break; } - case AUDIT_AVC_PATH: { - struct audit_aux_data_path *axi = (void *)aux; - audit_log_d_path(ab, "path=", axi->dentry, axi->mnt); - break; } - case AUDIT_FD_PAIR: { struct audit_aux_data_fd_pair *axs = (void *)aux; audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); @@ -1991,36 +1974,6 @@ void __audit_ptrace(struct task_struct *t) } /** - * audit_avc_path - record the granting or denial of permissions - * @dentry: dentry to record - * @mnt: mnt to record - * - * Returns 0 for success or NULL context or < 0 on error. 
- * - * Called from security/selinux/avc.c::avc_audit() - */ -int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) -{ - struct audit_aux_data_path *ax; - struct audit_context *context = current->audit_context; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->dentry = dget(dentry); - ax->mnt = mntget(mnt); - - ax->d.type = AUDIT_AVC_PATH; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** * audit_signal_info - record signal info for shutting down audit subsystem * @sig: signal value * @t: task being signaled -- cgit v1.1 From abd4f7505bafdd6c5319fe3cb5caf9af6104e17a Mon Sep 17 00:00:00 2001 From: Masoud Asgharifard Sharbiani Date: Sun, 22 Jul 2007 11:12:28 +0200 Subject: x86: i386-show-unhandled-signals-v3 This patch makes the i386 behave the same way that x86_64 does when a segfault happens. A line gets printed to the kernel log so that tools that need to check for failures can behave more uniformly between debug.show_unhandled_signals sysctl variable to 0 (or by doing echo 0 > /proc/sys/debug/exception-trace) Also, all of the lines being printed are now using printk_ratelimit() to deny the ability of DoS from a local user with a program like the following: main() { while (1) if (!fork()) *(int *)0 = 0; } This new revision also includes the fix that Andrew did which got rid of new sysctl that was added to the system in earlier versions of this. Also, 'show-unhandled-signals' sysctl has been renamed back to the old 'exception-trace' to avoid breakage of people's scripts. AK: Enabling by default for i386 will be likely controversal, but let's see what happens AK: Really folks, before complaining just fix your segfaults AK: I bet this will find a lot of silent issues Signed-off-by: Masoud Sharbiani Signed-off-by: Andi Kleen [ Personally, I've found the complaints useful on x86-64, so I'm all for this. That said, I wonder if we could do it more prettily.. -Linus ] Signed-off-by: Linus Torvalds --- kernel/signal.c | 10 ++++++++++ kernel/sysctl.c | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 39d1227..ef8156a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -255,6 +255,16 @@ flush_signal_handlers(struct task_struct *t, int force_default) } } +int unhandled_signal(struct task_struct *tsk, int sig) +{ + if (is_init(tsk)) + return 1; + if (tsk->ptrace & PT_PTRACED) + return 0; + return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || + (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); +} + /* Notify the system that a driver wants to block all signals for this * process, and wants to be notified if any signals at all were to be diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2222998..ddebf3f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1203,6 +1203,16 @@ static ctl_table fs_table[] = { }; static ctl_table debug_table[] = { +#ifdef CONFIG_X86 + { + .ctl_name = CTL_UNNUMBERED, + .procname = "exception-trace", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif { .ctl_name = 0 } }; -- cgit v1.1
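The fork-bomb fragment in the changelog above is shorthand; a minimal compilable program that produces a single such report looks like the sketch below. With the reporting enabled via the new sysctl (/proc/sys/debug/exception-trace), the unhandled SIGSEGV results in one ratelimited line in the kernel log; the exact wording comes from arch code and is not part of the kernel/ diff shown here.

#include <stdio.h>
#include <unistd.h>

int main(void)
{
    volatile int *p = NULL;

    printf("pid %d about to take an unhandled SIGSEGV\n", (int)getpid());
    fflush(stdout);

    /* no handler installed and not ptraced, so unhandled_signal() is true */
    *p = 0;
    return 0;   /* never reached */
}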