From a3d03ecaf9fe722bf96e4ef4a2f5e42ef652ddeb Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 13 Apr 2009 15:23:53 +0800 Subject: tracing: Fix power tracer header Before patch: # tracer: power # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | [ 676.875865889] CSTATE: Going to C1 on cpu 0 for 0.005911463 [ 676.882938805] CSTATE: Going to C1 on cpu 0 for 0.104796532 ... After patch: # tracer: power # # TIMESTAMP STATE EVENT # | | | [ 676.875865889] CSTATE: Going to C1 on cpu 0 for 0.005911463 [ 676.882938805] CSTATE: Going to C1 on cpu 0 for 0.104796532 ... v2: Use seq_puts instead of seq_printf Signed-off-by: Zhao Lei Cc: Arjan van de Ven Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49E2E889.5000903@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_power.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index bae791e..1184397 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -186,6 +186,12 @@ static enum print_line_t power_print_line(struct trace_iterator *iter) return TRACE_TYPE_UNHANDLED; } +static void power_print_header(struct seq_file *s) +{ + seq_puts(s, "# TIMESTAMP STATE EVENT\n"); + seq_puts(s, "# | | |\n"); +} + static struct tracer power_tracer __read_mostly = { .name = "power", @@ -194,6 +200,7 @@ static struct tracer power_tracer __read_mostly = .stop = stop_power_trace, .reset = power_trace_reset, .print_line = power_print_line, + .print_header = power_print_header, }; static int init_power_trace(void) -- cgit v1.1 From 4be6f6bb66111c9468733a4ed9cad10dc3a762c0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 13 Apr 2009 14:39:33 -0700 Subject: mm: move the scan_unevictable_pages sysctl to the vm table vm knobs should go in the vm table. Probably too late for randomize_va_space though. Signed-off-by: Peter Zijlstra Acked-by: Lee Schermerhorn Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4286b62..e3d2c7d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -902,16 +902,6 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_UNEVICTABLE_LRU - { - .ctl_name = CTL_UNNUMBERED, - .procname = "scan_unevictable_pages", - .data = &scan_unevictable_pages, - .maxlen = sizeof(scan_unevictable_pages), - .mode = 0644, - .proc_handler = &scan_unevictable_handler, - }, -#endif #ifdef CONFIG_SLOW_WORK { .ctl_name = CTL_UNNUMBERED, @@ -1302,6 +1292,16 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif +#ifdef CONFIG_UNEVICTABLE_LRU + { + .ctl_name = CTL_UNNUMBERED, + .procname = "scan_unevictable_pages", + .data = &scan_unevictable_pages, + .maxlen = sizeof(scan_unevictable_pages), + .mode = 0644, + .proc_handler = &scan_unevictable_handler, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.1 From f1671f6d783a2385d32e11f456cbe32f0e4b4b49 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 13 Apr 2009 14:40:03 -0700 Subject: ptrace: fix exit_ptrace() vs ptrace_traceme() race Pointed out by Roland. The bug was recently introduced by me in "forget_original_parent: split out the un-ptrace part", commit 39c626ae47c469abdfd30c6e42eff884931380d6. Since that patch we have a window after exit_ptrace() drops tasklist and before forget_original_parent() takes it again. In this window the child can do ptrace(PTRACE_TRACEME) and nobody can untrace this child after that. Change ptrace_traceme() to not attach to the exiting ->real_parent. We don't report the error in this case, we pretend we attach right before ->real_parent calls exit_ptrace() which should untrace us anyway. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 64191fa..dfcd83c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -604,10 +604,11 @@ repeat: ret = security_ptrace_traceme(current->parent); /* - * Set the ptrace bit in the process ptrace flags. - * Then link us on our parent's ptraced list. + * Check PF_EXITING to ensure ->real_parent has not passed + * exit_ptrace(). Otherwise we don't report the error but + * pretend ->real_parent untraces us right after return. */ - if (!ret) { + if (!ret && !(current->real_parent->flags & PF_EXITING)) { current->ptrace |= PT_PTRACED; __ptrace_link(current, current->real_parent); } -- cgit v1.1 From 3d26dcf7679c5cc6c9f3b95ffdb2152fba2b7fae Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 13 Apr 2009 14:40:08 -0700 Subject: kernel/sys.c: clean up sys_shutdown exit path Impact: cleanup, fix Clean up sys_shutdown() exit path. Factor out common code. Return correct error code instead of always 0 on failure. Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 51dbb55..e7998cf 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -360,6 +360,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, void __user *, arg) { char buffer[256]; + int ret = 0; /* We only trust the superuser with rebooting the system. */ if (!capable(CAP_SYS_BOOT)) @@ -397,7 +398,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, kernel_halt(); unlock_kernel(); do_exit(0); - break; + panic("cannot halt"); case LINUX_REBOOT_CMD_POWER_OFF: kernel_power_off(); @@ -417,29 +418,22 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, #ifdef CONFIG_KEXEC case LINUX_REBOOT_CMD_KEXEC: - { - int ret; - ret = kernel_kexec(); - unlock_kernel(); - return ret; - } + ret = kernel_kexec(); + break; #endif #ifdef CONFIG_HIBERNATION case LINUX_REBOOT_CMD_SW_SUSPEND: - { - int ret = hibernate(); - unlock_kernel(); - return ret; - } + ret = hibernate(); + break; #endif default: - unlock_kernel(); - return -EINVAL; + ret = -EINVAL; + break; } unlock_kernel(); - return 0; + return ret; } static void deferred_cad(struct work_struct *dummy) -- cgit v1.1 From 132380a06b24704fd6c9be55c44d4ef3972cead2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 2 Apr 2009 14:18:25 +0800 Subject: tracing, sched: mark get_parent_ip() notrace Impact: remove overly redundant tracing entries When tracer is "function" or "function_graph", way too much "get_parent_ip" entries are recorded in ring_buffer. Signed-off-by: Lai Jiangshan Acked-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <49D458B1.5000703@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5724508..e90e70e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4846,7 +4846,7 @@ void scheduler_tick(void) #endif } -unsigned long get_parent_ip(unsigned long addr) +notrace unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { addr = CALLER_ADDR2; -- cgit v1.1 From 557055bebe9212dfa6b9f5df811dfd0dac77ec55 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 13 Apr 2009 16:02:34 +0800 Subject: tracing: Fix branch tracer header Before patch: # tracer: branch # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | <...>-2981 [000] 24008.872738: [ ok ] trace_irq_handler_exit:irq_event_types.h:41 <...>-2981 [000] 24008.872742: [ ok ] note_interrupt:spurious.c:229 ... After patch: # tracer: branch # # TASK-PID CPU# TIMESTAMP CORRECT FUNC:FILE:LINE # | | | | | | <...>-2985 [000] 26329.142970: [ ok ] slab_free:slub.c:1776 <...>-2985 [000] 26329.142972: [ ok ] trace_kmem_cache_free:kmem_event_types.h:191 ... Signed-off-by: Zhao Lei Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <49E2F19A.3040006@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index ad8c22e..8333715 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -155,6 +155,13 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter, return TRACE_TYPE_HANDLED; } +static void branch_print_header(struct seq_file *s) +{ + seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" + " FUNC:FILE:LINE\n"); + seq_puts(s, "# | | | | | " + " |\n"); +} static struct trace_event trace_branch_event = { .type = TRACE_BRANCH, @@ -169,6 +176,7 @@ static struct tracer branch_trace __read_mostly = #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_branch, #endif /* CONFIG_FTRACE_SELFTEST */ + .print_header = branch_print_header, }; __init static int init_branch_tracer(void) -- cgit v1.1 From ef631b0ca01655d24e9ca7e199262c4a46416a26 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2009 21:31:16 -0700 Subject: rcu: Make hierarchical RCU less IPI-happy This patch fixes a hierarchical-RCU performance bug located by Anton Blanchard. The problem stems from a misguided attempt to provide a work-around for jiffies-counter failure. This work-around uses a per-CPU n_rcu_pending counter, which is incremented on each call to rcu_pending(), which in turn is called from each scheduling-clock interrupt. Each CPU then treats this counter as a surrogate for the jiffies counter, so that if the jiffies counter fails to advance, the per-CPU n_rcu_pending counter will cause RCU to invoke force_quiescent_state(), which in turn will (among other things) send resched IPIs to CPUs that have thus far failed to pass through an RCU quiescent state. Unfortunately, each CPU resets only its own counter after sending a batch of IPIs. This means that the other CPUs will also (needlessly) send -another- round of IPIs, for a full N-squared set of IPIs in the worst case every three scheduler-clock ticks until the grace period finally ends. It is not reasonable for a given CPU to reset each and every n_rcu_pending for all the other CPUs, so this patch instead simply disables the jiffies-counter "training wheels", thus eliminating the excessive IPIs. Note that the jiffies-counter IPIs do not have this problem due to the fact that the jiffies counter is global, so that the CPU sending the IPIs can easily reset things, thus preventing the other CPUs from sending redundant IPIs. Note also that the n_rcu_pending counter remains, as it will continue to be used for tracing. It may also see use to update the jiffies counter, should an appropriate kick-the-jiffies-counter API appear. Located-by: Anton Blanchard Tested-by: Anton Blanchard Signed-off-by: Paul E. McKenney Cc: anton@samba.org Cc: akpm@linux-foundation.org Cc: dipankar@in.ibm.com Cc: manfred@colorfullife.com Cc: cl@linux-foundation.org Cc: josht@linux.vnet.ibm.com Cc: schamp@sgi.com Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: ego@in.ibm.com Cc: laijs@cn.fujitsu.com Cc: rostedt@goodmis.org Cc: peterz@infradead.org Cc: penberg@cs.helsinki.fi Cc: andi@firstfloor.org Cc: "Paul E. McKenney" LKML-Reference: <12396834793575-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 19 ++++--------------- kernel/rcutree_trace.c | 14 +++++--------- 2 files changed, 9 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7f32669..d2a372f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -530,8 +530,6 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) rdp->qs_pending = 1; rdp->passed_quiesc = 0; rdp->gpnum = rsp->gpnum; - rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending + - RCU_JIFFIES_TILL_FORCE_QS; } /* @@ -578,8 +576,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rsp->gpnum++; rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending + - RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); dyntick_record_completed(rsp, rsp->completed - 1); note_new_gpnum(rsp, rdp); @@ -1055,7 +1051,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) { unsigned long flags; long lastcomp; - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; struct rcu_node *rnp = rcu_get_root(rsp); u8 signaled; @@ -1066,16 +1061,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) return; /* Someone else is already on the job. */ } if (relaxed && - (long)(rsp->jiffies_force_qs - jiffies) >= 0 && - (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0) + (long)(rsp->jiffies_force_qs - jiffies) >= 0) goto unlock_ret; /* no emergency and done recently. */ rsp->n_force_qs++; spin_lock(&rnp->lock); lastcomp = rsp->completed; signaled = rsp->signaled; rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending + - RCU_JIFFIES_TILL_FORCE_QS; if (lastcomp == rsp->gpnum) { rsp->n_force_qs_ngp++; spin_unlock(&rnp->lock); @@ -1144,8 +1136,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) * If an RCU GP has gone long enough, go check for dyntick * idle CPUs and, if needed, send resched IPIs. */ - if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || - (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0) + if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) force_quiescent_state(rsp, 1); /* @@ -1230,8 +1221,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), if (unlikely(++rdp->qlen > qhimark)) { rdp->blimit = LONG_MAX; force_quiescent_state(rsp, 0); - } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || - (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0) + } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) force_quiescent_state(rsp, 1); local_irq_restore(flags); } @@ -1290,8 +1280,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Has an RCU GP gone long enough to send resched IPIs &c? */ if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && - ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || - (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)) + ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) return 1; /* nothing to do */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4ee954f..4b1875b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -49,14 +49,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->completed, rdp->gpnum, rdp->passed_quiesc, rdp->passed_quiesc_completed, - rdp->qs_pending, - rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending, - (int)(rdp->n_rcu_pending & 0xffff)); + rdp->qs_pending); #ifdef CONFIG_NO_HZ seq_printf(m, " dt=%d/%d dn=%d df=%lu", rdp->dynticks->dynticks, @@ -102,14 +100,12 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld", + seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", rdp->completed, rdp->gpnum, rdp->passed_quiesc, rdp->passed_quiesc_completed, - rdp->qs_pending, - rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending, - rdp->n_rcu_pending); + rdp->qs_pending); #ifdef CONFIG_NO_HZ seq_printf(m, ",%d,%d,%d,%lu", rdp->dynticks->dynticks, @@ -123,7 +119,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\","); + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); #ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ -- cgit v1.1 From 6ec3cfeca04622e3d80c9270191cd7f5f88214af Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Mon, 13 Apr 2009 15:20:58 -0700 Subject: x86, irq: Remove IRQ_DISABLED check in process context IRQ move As discussed in the thread here: http://marc.info/?l=linux-kernel&m=123964468521142&w=2 Eric W. Biederman observed: > It looks like some additional bugs have slipped in since last I looked. > > set_irq_affinity does this: > ifdef CONFIG_GENERIC_PENDING_IRQ > if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { > cpumask_copy(desc->affinity, cpumask); > desc->chip->set_affinity(irq, cpumask); > } else { > desc->status |= IRQ_MOVE_PENDING; > cpumask_copy(desc->pending_mask, cpumask); > } > #else > > That IRQ_DISABLED case is a software state and as such it has nothing to > do with how safe it is to move an irq in process context. [...] > > The only reason we migrate MSIs in interrupt context today is that there > wasn't infrastructure for support migration both in interrupt context > and outside of it. Yes. The idea here was to force the MSI migration to happen in process context. One of the patches in the series did disable_irq(dev->irq); irq_set_affinity(dev->irq, cpumask_of(dev->cpu)); enable_irq(dev->irq); with the above patch adding irq/manage code check for interrupt disabled and moving the interrupt in process context. IIRC, there was no IRQ_MOVE_PCNTXT when we were developing this HPET code and we ended up having this ugly hack. IRQ_MOVE_PCNTXT was there when we eventually submitted the patch upstream. But, looks like I did a blind rebasing instead of using IRQ_MOVE_PCNTXT in hpet MSI code. Below patch fixes this. i.e., revert commit 932775a4ab622e3c99bd59f14cc and add PCNTXT to HPET MSI setup. Also removes copying of desc->affinity in generic code as set_affinity routines are doing it internally. Reported-by: "Eric W. Biederman" Signed-off-by: Venkatesh Pallipadi Acked-by: "Eric W. Biederman" Cc: "Li Shaohua" Cc: Gary Hade Cc: "lcm@us.ibm.com" Cc: suresh.b.siddha@intel.com LKML-Reference: <20090413222058.GB8211@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7e2e7dd..2734eca 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -109,10 +109,9 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - cpumask_copy(desc->affinity, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) desc->chip->set_affinity(irq, cpumask); - } else { + else { desc->status |= IRQ_MOVE_PENDING; cpumask_copy(desc->pending_mask, cpumask); } -- cgit v1.1 From 297dbf50d7ab0539cf9cf7f2a66918665a18e45e Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Wed, 15 Apr 2009 10:37:04 +0530 Subject: swap: Remove code handling bio_alloc failure with __GFP_WAIT Remove code handling bio_alloc failure with __GFP_WAIT. Signed-off-by: Nikanth Karthikesan Signed-off-by: Jens Axboe --- kernel/power/swap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 505f319..8ba052c 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -64,8 +64,6 @@ static int submit(int rw, pgoff_t page_off, struct page *page, struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - if (!bio) - return -ENOMEM; bio->bi_sector = page_off * (PAGE_SIZE >> 9); bio->bi_bdev = resume_bdev; bio->bi_end_io = end_swap_bio_read; -- cgit v1.1 From 5b1d07ed0e5b2707f786957c7a40eb2f399c84a8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 15 Apr 2009 19:35:01 +0100 Subject: RCU: Don't try and predeclare inline funcs as it upsets some versions of gcc Don't try and predeclare inline funcs like this: static inline void wait_migrated_callbacks(void) ... static void _rcu_barrier(enum rcu_barrier type) { ... wait_migrated_callbacks(); } ... static inline void wait_migrated_callbacks(void) { wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); } as it upsets some versions of gcc under some circumstances: kernel/rcupdate.c: In function `_rcu_barrier': kernel/rcupdate.c:125: sorry, unimplemented: inlining failed in call to 'wait_migrated_callbacks': function body not available kernel/rcupdate.c:152: sorry, unimplemented: called from here This can be dealt with by simply putting the static variables (rcu_migrate_*) at the top, and moving the implementation of the function up so that it replaces its forward declaration. Signed-off-by: David Howells Cc: Dipankar Sarma Cc: Paul E. McKenney Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2c7b845..a967c9f 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -58,6 +58,10 @@ static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; int rcu_scheduler_active __read_mostly; +static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); +static struct rcu_head rcu_migrate_head[3]; +static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); + /* * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. @@ -122,7 +126,10 @@ static void rcu_barrier_func(void *type) } } -static inline void wait_migrated_callbacks(void); +static inline void wait_migrated_callbacks(void) +{ + wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); +} /* * Orchestrate the specified type of RCU barrier, waiting for all @@ -179,21 +186,12 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL_GPL(rcu_barrier_sched); -static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); -static struct rcu_head rcu_migrate_head[3]; -static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); - static void rcu_migrate_callback(struct rcu_head *notused) { if (atomic_dec_and_test(&rcu_migrate_type_count)) wake_up(&rcu_migrate_wq); } -static inline void wait_migrated_callbacks(void) -{ - wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); -} - static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, unsigned long action, void *hcpu) { -- cgit v1.1 From 381512cf3d27f63f7a45b1bbe7d2d609c2ea3b74 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 14 Apr 2009 09:09:36 +0530 Subject: sched: Avoid printing sched_group::__cpu_power for default case Commit 46e0bb9c12f4 ("sched: Print sched_group::__cpu_power in sched_domain_debug") produces a messy dmesg output while attempting to print the sched_group::__cpu_power for each group in the sched_domain hierarchy. Fix this by avoid printing the __cpu_power for default cases. (i.e, __cpu_power == SCHED_LOAD_SCALE). [ Impact: reduce syslog clutter ] Reported-by: Tony Luck Signed-off-by: Gautham R Shenoy Fixed-by: Tony Luck Cc: a.p.zijlstra@chello.nl LKML-Reference: <20090414033936.GA534@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e90e70e..b902e58 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7367,8 +7367,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_or(groupmask, groupmask, sched_group_cpus(group)); cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - printk(KERN_CONT " %s (__cpu_power = %d)", str, - group->__cpu_power); + + printk(KERN_CONT " %s", str); + if (group->__cpu_power != SCHED_LOAD_SCALE) { + printk(KERN_CONT " (__cpu_power = %d)", + group->__cpu_power); + } group = group->next; } while (group != sd->groups); -- cgit v1.1 From 79d381c9f2354b594dcab9b04dfcc0debf7294fe Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Thu, 16 Apr 2009 19:30:18 -0400 Subject: kernel/softirq.c: fix sparse warning Fix sparse warning in kernel/softirq.c. warning: do-while statement is not a compound statement Signed-off-by: H Hartley Sweeten LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/softirq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 2fecefa..b525dd3 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -472,9 +472,9 @@ void tasklet_kill(struct tasklet_struct *t) printk("Attempt to kill tasklet from interrupt\n"); while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { - do + do { yield(); - while (test_bit(TASKLET_STATE_SCHED, &t->state)); + } while (test_bit(TASKLET_STATE_SCHED, &t->state)); } tasklet_unlock_wait(t); clear_bit(TASKLET_STATE_SCHED, &t->state); -- cgit v1.1 From c8a250058656495be02c00de61e26b017c86ef00 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Apr 2009 09:40:49 +0200 Subject: lockdep: more robust lockdep_map init sequence Steven Rostedt reported: > OK, I think I figured this bug out. This is a lockdep issue with respect > to tracepoints. > > The trace points in lockdep are called all the time. Outside the lockdep > logic. But if lockdep were to trigger an error / warning (which this run > did) we might be in trouble. For new locks, like the dentry->d_lock, that > are created, they will not get a name: > > void lockdep_init_map(struct lockdep_map *lock, const char *name, > struct lock_class_key *key, int subclass) > { > if (unlikely(!debug_locks)) > return; > > When a problem is found by lockdep, debug_locks becomes false. Thus we > stop allocating names for locks. This dentry->d_lock I had, now has no > name. Worse yet, I have CONFIG_DEBUG_VM set, that scrambles non > initialized memory. Thus, when the trace point was hit, it had junk for > the lock->name, and the machine crashed. Ah, nice catch. I think we should put at least the name in regardless. Ensure we at least initialize the trivial entries of the depmap so that they can be relied upon, even when lockdep itself decided to pack up and go home. [ Impact: fix lock tracing after lockdep warnings. ] Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Andrew Morton Cc: Frederic Weisbecker LKML-Reference: <1239954049.23397.4156.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b0f0118..accb40c 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2490,13 +2490,20 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - if (unlikely(!debug_locks)) + lock->class_cache = NULL; +#ifdef CONFIG_LOCK_STAT + lock->cpu = raw_smp_processor_id(); +#endif + + if (DEBUG_LOCKS_WARN_ON(!name)) { + lock->name = "NULL"; return; + } + + lock->name = name; if (DEBUG_LOCKS_WARN_ON(!key)) return; - if (DEBUG_LOCKS_WARN_ON(!name)) - return; /* * Sanity check, the lock-class key must be persistent: */ @@ -2505,12 +2512,11 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, DEBUG_LOCKS_WARN_ON(1); return; } - lock->name = name; lock->key = key; - lock->class_cache = NULL; -#ifdef CONFIG_LOCK_STAT - lock->cpu = raw_smp_processor_id(); -#endif + + if (unlikely(!debug_locks)) + return; + if (subclass) register_lock_class(lock, subclass, 1); } -- cgit v1.1 From ff54250a0ebab7f90a5f848a0ba63f999830c872 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 18 Apr 2009 21:44:24 -0700 Subject: Remove 'recurse into child resources' logic from 'reserve_region_with_split()' This function is not actually used right now, since the original use case for it was done with insert_resource_expand_to_fit() instead. However, we now have another usage case that wants to basically do a "reserve IO resource, splitting around existing resources", however that one doesn't actually want the "recurse into the conflicting resource" logic at all. And since recursing into the conflicting resource was the most complex part, and isn't wanted, just remove it. Maybe we'll some day want both versions, but we can just resurrect the logic then. Tested-by: Yinghai Lu Signed-off-by: Linus Torvalds --- kernel/resource.c | 46 ++++++++++++---------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index fd5d7d5..ac5f3a3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -533,43 +533,21 @@ static void __init __reserve_region_with_split(struct resource *root, res->end = end; res->flags = IORESOURCE_BUSY; - for (;;) { - conflict = __request_resource(parent, res); - if (!conflict) - break; - if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) - continue; - } - - /* Uhhuh, that didn't work out.. */ - kfree(res); - res = NULL; - break; - } - - if (!res) { - /* failed, split and try again */ - - /* conflict covered whole area */ - if (conflict->start <= start && conflict->end >= end) - return; + conflict = __request_resource(parent, res); + if (!conflict) + return; - if (conflict->start > start) - __reserve_region_with_split(root, start, conflict->start-1, name); - if (!(conflict->flags & IORESOURCE_BUSY)) { - resource_size_t common_start, common_end; + /* failed, split and try again */ + kfree(res); - common_start = max(conflict->start, start); - common_end = min(conflict->end, end); - if (common_start < common_end) - __reserve_region_with_split(root, common_start, common_end, name); - } - if (conflict->end < end) - __reserve_region_with_split(root, conflict->end+1, end, name); - } + /* conflict covered whole area */ + if (conflict->start <= start && conflict->end >= end) + return; + if (conflict->start > start) + __reserve_region_with_split(root, start, conflict->start-1, name); + if (conflict->end < end) + __reserve_region_with_split(root, conflict->end+1, end, name); } void __init reserve_region_with_split(struct resource *root, -- cgit v1.1 From 6a7c7eaf71b636f197d73b381a2ab729ebdcfb2e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 19 Apr 2009 20:08:42 +0200 Subject: PM/Suspend: Introduce two new platform callbacks to avoid breakage Commit 900af0d973856d6feb6fc088c2d0d3fde57707d3 (PM: Change suspend code ordering) changed the ordering of suspend code in such a way that the platform .prepare() callback is now executed after the device drivers' late suspend callbacks have run. Unfortunately, this turns out to break ARM platforms that need to talk via I2C to power control devices during the .prepare() callback. For this reason introduce two new platform suspend callbacks, .prepare_late() and .wake(), that will be called just prior to disabling non-boot CPUs and right after bringing them back on line, respectively, and use them instead of .prepare() and .finish() for ACPI suspend. Make the PM core execute the .prepare() and .finish() platform suspend callbacks where they were executed previously (that is, right after calling the regular suspend methods provided by device drivers and right before executing their regular resume methods, respectively). It is not necessary to make analogous changes to the hibernation code and data structures at the moment, because they are only used by ACPI platforms. Signed-off-by: Rafael J. Wysocki Reported-by: Russell King Acked-by: Len Brown --- kernel/power/main.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index f172f41..f99ed6a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -291,20 +291,26 @@ static int suspend_enter(suspend_state_t state) device_pm_lock(); + if (suspend_ops->prepare) { + error = suspend_ops->prepare(); + if (error) + goto Done; + } + error = device_power_down(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Done; + goto Platfrom_finish; } - if (suspend_ops->prepare) { - error = suspend_ops->prepare(); + if (suspend_ops->prepare_late) { + error = suspend_ops->prepare_late(); if (error) goto Power_up_devices; } if (suspend_test(TEST_PLATFORM)) - goto Platfrom_finish; + goto Platform_wake; error = disable_nonboot_cpus(); if (error || suspend_test(TEST_CPUS)) @@ -326,13 +332,17 @@ static int suspend_enter(suspend_state_t state) Enable_cpus: enable_nonboot_cpus(); - Platfrom_finish: - if (suspend_ops->finish) - suspend_ops->finish(); + Platform_wake: + if (suspend_ops->wake) + suspend_ops->wake(); Power_up_devices: device_power_up(PMSG_RESUME); + Platfrom_finish: + if (suspend_ops->finish) + suspend_ops->finish(); + Done: device_pm_unlock(); -- cgit v1.1 From 24b6f16ecf37f918a1934d590e9e71c100d6388f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 03:25:41 -0400 Subject: No need for crossing to mountpoint in audit_tag_tree() is_under() will DTRT anyway. And yes, is_subdir() behaviour is intentional. Signed-off-by: Al Viro --- kernel/audit_tree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 917ab95..6e73517 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -734,9 +734,6 @@ int audit_tag_tree(char *old, char *new) dentry = dget(path.dentry); path_put(&path); - if (dentry == tagged->mnt_root && dentry == mnt->mnt_root) - follow_up(&mnt, &dentry); - list_add_tail(&list, &tagged->mnt_list); mutex_lock(&audit_filter_mutex); -- cgit v1.1 From 8e19608e8b5c001e4a66ce482edc474f05fb7355 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 21 Apr 2009 12:24:00 -0700 Subject: clocksource: pass clocksource to read() callback Pass clocksource pointer to the read() callback for clocksources. This allows us to share the callback between multiple instances. [hugh@veritas.com: fix powerpc build of clocksource pass clocksource mods] [akpm@linux-foundation.org: cleanup] Signed-off-by: Magnus Damm Acked-by: John Stultz Cc: Thomas Gleixner Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/clocksource.c | 8 ++++---- kernel/time/jiffies.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c46c931..ecfd7b5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -181,12 +181,12 @@ static void clocksource_watchdog(unsigned long data) resumed = test_and_clear_bit(0, &watchdog_resumed); - wdnow = watchdog->read(); + wdnow = watchdog->read(watchdog); wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); watchdog_last = wdnow; list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { - csnow = cs->read(); + csnow = cs->read(cs); if (unlikely(resumed)) { cs->wd_last = csnow; @@ -247,7 +247,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) list_add(&cs->wd_list, &watchdog_list); if (!started && watchdog) { - watchdog_last = watchdog->read(); + watchdog_last = watchdog->read(watchdog); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); @@ -268,7 +268,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) cse->flags &= ~CLOCK_SOURCE_WATCHDOG; /* Start if list is not empty */ if (!list_empty(&watchdog_list)) { - watchdog_last = watchdog->read(); + watchdog_last = watchdog->read(watchdog); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 06f1975..c3f6c30 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -50,7 +50,7 @@ */ #define JIFFIES_SHIFT 8 -static cycle_t jiffies_read(void) +static cycle_t jiffies_read(struct clocksource *cs) { return (cycle_t) jiffies; } -- cgit v1.1 From 4614e6adafa2c5e6c3a9c245af2807fa7bc5117a Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 21 Apr 2009 12:24:02 -0700 Subject: clocksource: add enable() and disable() callbacks Add enable() and disable() callbacks for clocksources. This allows us to put unused clocksources in power save mode. The functions clocksource_enable() and clocksource_disable() wrap the callbacks and are inserted in the timekeeping code to enable before use and disable after switching to a new clocksource. Signed-off-by: Magnus Damm Acked-by: John Stultz Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/timekeeping.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 900f1b6..687dff4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -182,7 +182,7 @@ EXPORT_SYMBOL(do_settimeofday); */ static void change_clocksource(void) { - struct clocksource *new; + struct clocksource *new, *old; new = clocksource_get_next(); @@ -191,11 +191,16 @@ static void change_clocksource(void) clocksource_forward_now(); - new->raw_time = clock->raw_time; + if (clocksource_enable(new)) + return; + new->raw_time = clock->raw_time; + old = clock; clock = new; + clocksource_disable(old); + clock->cycle_last = 0; - clock->cycle_last = clocksource_read(new); + clock->cycle_last = clocksource_read(clock); clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -292,6 +297,7 @@ void __init timekeeping_init(void) ntp_init(); clock = clocksource_get_next(); + clocksource_enable(clock); clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); clock->cycle_last = clocksource_read(clock); -- cgit v1.1 From b48ccb095a0c9257241261ec2bd1cbb1bdabc48b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2009 09:36:52 +0200 Subject: locking: clarify kernel-taint warning message Andi Kleen reported this message triggering on non-lockdep kernels: Disabling lockdep due to kernel taint Clarify the message to say 'lock debugging' - debug_locks_off() turns off all things lock debugging, not just lockdep. [ Impact: change kernel warning message text ] Reported-by: Andi Kleen Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 934fb37..3dcaa16 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -221,7 +221,7 @@ void add_taint(unsigned flag) * post-warning case. */ if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) - printk(KERN_WARNING "Disabling lockdep due to kernel taint\n"); + printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); } -- cgit v1.1 From 418df63c2d94f238ac7e1d1d53be35dd6b7a7252 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Wed, 22 Apr 2009 12:01:49 +0100 Subject: Delete slow-work timers properly Slow-work appears to delete its timer as soon as the first user unregisters, even though other users could be active. At the same time, it never seems to delete slow_work_oom_timer. Arrange for both to happen in the shutdown path. Signed-off-by: Jonathan Corbet Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/slow-work.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/slow-work.c b/kernel/slow-work.c index cf2bc01..b28d1913 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -609,14 +609,14 @@ void slow_work_unregister_user(void) if (slow_work_user_count == 0) { printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); slow_work_threads_should_exit = true; + del_timer_sync(&slow_work_cull_timer); + del_timer_sync(&slow_work_oom_timer); wake_up_all(&slow_work_thread_wq); wait_for_completion(&slow_work_last_thread_exited); printk(KERN_NOTICE "Slow work thread pool:" " Shut down complete\n"); } - del_timer_sync(&slow_work_cull_timer); - mutex_unlock(&slow_work_user_lock); } EXPORT_SYMBOL(slow_work_unregister_user); -- cgit v1.1 From 0c8454f56623505a99463405fd7d5664adfbb094 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 25 Apr 2009 00:16:06 +0200 Subject: PM/Hibernate: Fix waiting for image device to appear on resume Commit c751085943362143f84346d274e0011419c84202 ("PM/Hibernate: Wait for SCSI devices scan to complete during resume") added a call to scsi_complete_async_scans() to software_resume(), so that it waited for the SCSI scanning to complete, but the call was added at a wrong place. Namely, it should have been added after wait_for_device_probe(), which is called only if the image partition hasn't been specified yet. Also, it's reasonable to check if the image partition is present and only wait for the device probing and SCSI scanning to complete if it is not the case. Additionally, since noresume is checked right at the beginning of software_resume() and the function returns immediately if it's set, it doesn't make sense to check it once again later. Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 51 +++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 0854770..e71ca9c 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -646,13 +646,6 @@ static int software_resume(void) return 0; /* - * We can't depend on SCSI devices being available after loading one of - * their modules if scsi_complete_async_scans() is not called and the - * resume device usually is a SCSI one. - */ - scsi_complete_async_scans(); - - /* * name_to_dev_t() below takes a sysfs buffer mutex when sysfs * is configured into the kernel. Since the regular hibernate * trigger path is via sysfs which takes a buffer mutex before @@ -663,32 +656,42 @@ static int software_resume(void) * here to avoid lockdep complaining. */ mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); + + if (swsusp_resume_device) + goto Check_image; + + if (!strlen(resume_file)) { + error = -ENOENT; + goto Unlock; + } + + pr_debug("PM: Checking image partition %s\n", resume_file); + + /* Check if the device is there */ + swsusp_resume_device = name_to_dev_t(resume_file); if (!swsusp_resume_device) { - if (!strlen(resume_file)) { - mutex_unlock(&pm_mutex); - return -ENOENT; - } /* * Some device discovery might still be in progress; we need * to wait for this to finish. */ wait_for_device_probe(); + /* + * We can't depend on SCSI devices being available after loading + * one of their modules until scsi_complete_async_scans() is + * called and the resume device usually is a SCSI one. + */ + scsi_complete_async_scans(); + swsusp_resume_device = name_to_dev_t(resume_file); - pr_debug("PM: Resume from partition %s\n", resume_file); - } else { - pr_debug("PM: Resume from partition %d:%d\n", - MAJOR(swsusp_resume_device), - MINOR(swsusp_resume_device)); + if (!swsusp_resume_device) { + error = -ENODEV; + goto Unlock; + } } - if (noresume) { - /** - * FIXME: If noresume is specified, we need to find the - * partition and reset it back to normal swap space. - */ - mutex_unlock(&pm_mutex); - return 0; - } + Check_image: + pr_debug("PM: Resume from partition %d:%d\n", + MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); pr_debug("PM: Checking hibernation image.\n"); error = swsusp_check(); -- cgit v1.1 From cad81bc2529ab8c62b6fdc83a1c0c7f4a87209eb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 27 Apr 2009 01:41:34 +0200 Subject: ptrace: ptrace_attach: fix the usage of ->cred_exec_mutex ptrace_attach() needs task->cred_exec_mutex, not current->cred_exec_mutex. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Acked-by: David Howells Signed-off-by: James Morris --- kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index dfcd83c..0692ab5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -188,7 +188,7 @@ int ptrace_attach(struct task_struct *task) /* Protect exec's credential calculations against our interference; * SUID, SGID and LSM creds get determined differently under ptrace. */ - retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + retval = mutex_lock_interruptible(&task->cred_exec_mutex); if (retval < 0) goto out; @@ -232,7 +232,7 @@ repeat: bad: write_unlock_irqrestore(&tasklist_lock, flags); task_unlock(task); - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(&task->cred_exec_mutex); out: return retval; } -- cgit v1.1 From 7267fa6819467669f5cc2ba81a615dcc88158b4b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:16:21 -0400 Subject: tracing: fix ref count in splice pages The pages allocated for the splice binary buffer did not initialize the ref count correctly. This caused pages not to be freed and causes a drastic memory leak. Thanks to logdev I was able to trace the tracer to find where the leak was. [ Impact: stop memory leak when using splice ] Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1ce5dc6..a884c09 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3448,6 +3448,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if (!ref) break; + ref->ref = 1; ref->buffer = info->tr->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer); if (!ref->page) { -- cgit v1.1 From f5f293a4e3d0a0c52cec31de6762c95050156516 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Apr 2009 14:44:49 +0200 Subject: sched: account system time properly Andrew Gallatin reported that IRQ and SOFTIRQ times were sometime not reported correctly on recent kernels, and even bisected to commit 457533a7d3402d1d91fbc125c8bd1bd16dcd3cd4 ([PATCH] fix scaled & unscaled cputime accounting) as the first bad commit. Further analysis pointed that commit 79741dd35713ff4f6fd0eafd59fa94e8a4ba922d ([PATCH] idle cputime accounting) was the real cause of the problem. account_process_tick() was not taking into account timer IRQ interrupting the idle task servicing a hard or soft irq. On mostly idle cpu, irqs were thus not accounted and top or mpstat could tell user/admin that cpu was 100 % idle, 0.00 % irq, 0.00 % softirq, while it was not. [ Impact: fix occasionally incorrect CPU statistics in top/mpstat ] Reported-by: Andrew Gallatin Re-reported-by: Andrew Morton Signed-off-by: Eric Dumazet Acked-by: Martin Schwidefsky Cc: rick.jones2@hp.com Cc: brice@myri.com Cc: Paul Mackerras Cc: Benjamin Herrenschmidt LKML-Reference: <49F84BC1.7080602@cosmosbay.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b902e58..26efa47 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4732,7 +4732,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (user_tick) account_user_time(p, one_jiffy, one_jiffy_scaled); - else if (p != rq->idle) + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) account_system_time(p, HARDIRQ_OFFSET, one_jiffy, one_jiffy_scaled); else -- cgit v1.1 From 6e85c5ba73c07b990798087e9b858c065db2b234 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Wed, 29 Apr 2009 19:14:32 -0400 Subject: kernel/posix-cpu-timers.c: fix sparse warning Sparse reports the following in kernel/posix-cpu-timers.c: warning: symbol 'firing' shadows an earlier one Signed-off-by: H Hartley Sweeten Cc: Subrata Modak LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c9dcf98..bece7c0 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1420,19 +1420,19 @@ void run_posix_cpu_timers(struct task_struct *tsk) * timer call will interfere. */ list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { - int firing; + int cpu_firing; + spin_lock(&timer->it_lock); list_del_init(&timer->it.cpu.entry); - firing = timer->it.cpu.firing; + cpu_firing = timer->it.cpu.firing; timer->it.cpu.firing = 0; /* * The firing flag is -1 if we collided with a reset * of the timer, which already reported this * almost-firing as an overrun. So don't generate an event. */ - if (likely(firing >= 0)) { + if (likely(cpu_firing >= 0)) cpu_timer_fire(timer); - } spin_unlock(&timer->it_lock); } } -- cgit v1.1 From d7226fb6ec5d4f325e4e7fd905894e2ea3eb3ae0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 May 2009 15:16:04 +0200 Subject: Revert "genirq: assert that irq handlers are indeed running in hardirq context" This reverts commit 044d408409cc4e1bc75c886e27ca85c270db104c. The commit added a warning when handle_IRQ_event() is called outside of hard interrupt context. This breaks the generic tasklet based interrupt resend mechanism which is used when the hardware has no way to retrigger the interrupt. So we get a warning for a use case which is correct and worked for years. Remove it. Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d82142b..26e0875 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -363,8 +363,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; - WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!"); - if (!(action->flags & IRQF_DISABLED)) local_irq_enable_in_hardirq(); -- cgit v1.1 From 74a03b69d1b5ce00a568e142ca97e76b7f5239c6 Mon Sep 17 00:00:00 2001 From: john stultz Date: Fri, 1 May 2009 13:10:25 -0700 Subject: clockevents: prevent endless loop in tick_handle_periodic() tick_handle_periodic() can lock up hard when a one shot clock event device is used in combination with jiffies clocksource. Avoid an endless loop issue by requiring that a highres valid clocksource be installed before we call tick_periodic() in a loop when using ONESHOT mode. The result is we will only increment jiffies once per interrupt until a continuous hardware clocksource is available. Without this, we can run into a endless loop, where each cycle through the loop, jiffies is updated which increments time by tick_period or more (due to clock steering), which can cause the event programming to think the next event was before the newly incremented time and fail causing tick_periodic() to be called again and the whole process loops forever. [ Impact: prevent hard lock up ] Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/time/tick-common.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 21a5ca8..83c4417 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -93,7 +93,17 @@ void tick_handle_periodic(struct clock_event_device *dev) for (;;) { if (!clockevents_program_event(dev, next, ktime_get())) return; - tick_periodic(cpu); + /* + * Have to be careful here. If we're in oneshot mode, + * before we call tick_periodic() in a loop, we need + * to be sure we're using a real hardware clocksource. + * Otherwise we could get trapped in an infinite + * loop, as the tick_periodic() increments jiffies, + * when then will increment time, posibly causing + * the loop to trigger again and again. + */ + if (timekeeping_valid_for_hres()) + tick_periodic(cpu); next = ktime_add(next, tick_period); } } -- cgit v1.1 From 9e4a5bda89034502fb144331e71a0efdfd5fae97 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 30 Apr 2009 15:08:57 -0700 Subject: mm: prevent divide error for small values of vm_dirty_bytes Avoid setting less than two pages for vm_dirty_bytes: this is necessary to avoid potential division by 0 (like the following) in get_dirty_limits(). [ 49.951610] divide error: 0000 [#1] PREEMPT SMP [ 49.952195] last sysfs file: /sys/devices/pci0000:00/0000:00:01.1/host0/target0:0:0/0:0:0:0/block/sda/uevent [ 49.952195] CPU 1 [ 49.952195] Modules linked in: pcspkr [ 49.952195] Pid: 3064, comm: dd Not tainted 2.6.30-rc3 #1 [ 49.952195] RIP: 0010:[] [] get_dirty_limits+0xe9/0x2c0 [ 49.952195] RSP: 0018:ffff88001de03a98 EFLAGS: 00010202 [ 49.952195] RAX: 00000000000000c0 RBX: ffff88001de03b80 RCX: 28f5c28f5c28f5c3 [ 49.952195] RDX: 0000000000000000 RSI: 00000000000000c0 RDI: 0000000000000000 [ 49.952195] RBP: ffff88001de03ae8 R08: 0000000000000000 R09: 0000000000000000 [ 49.952195] R10: ffff88001ddda9a0 R11: 0000000000000001 R12: 0000000000000001 [ 49.952195] R13: ffff88001fbc8218 R14: ffff88001de03b70 R15: ffff88001de03b78 [ 49.952195] FS: 00007fe9a435b6f0(0000) GS:ffff8800025d9000(0000) knlGS:0000000000000000 [ 49.952195] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 49.952195] CR2: 00007fe9a39ab000 CR3: 000000001de38000 CR4: 00000000000006e0 [ 49.952195] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 49.952195] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 49.952195] Process dd (pid: 3064, threadinfo ffff88001de02000, task ffff88001ddda250) [ 49.952195] Stack: [ 49.952195] ffff88001fa0de00 ffff88001f2dbd70 ffff88001f9fe800 000080b900000000 [ 49.952195] 00000000000000c0 ffff8800027a6100 0000000000000400 ffff88001fbc8218 [ 49.952195] 0000000000000000 0000000000000600 ffff88001de03bb8 ffffffff802d3ed7 [ 49.952195] Call Trace: [ 49.952195] [] balance_dirty_pages_ratelimited_nr+0x1d7/0x3f0 [ 49.952195] [] ? ext3_writeback_write_end+0x9e/0x120 [ 49.952195] [] generic_file_buffered_write+0x12f/0x330 [ 49.952195] [] __generic_file_aio_write_nolock+0x26d/0x460 [ 49.952195] [] ? generic_file_aio_write+0x52/0xd0 [ 49.952195] [] generic_file_aio_write+0x69/0xd0 [ 49.952195] [] ext3_file_write+0x26/0xc0 [ 49.952195] [] do_sync_write+0xf1/0x140 [ 49.952195] [] ? get_lock_stats+0x2a/0x60 [ 49.952195] [] ? autoremove_wake_function+0x0/0x40 [ 49.952195] [] vfs_write+0xcb/0x190 [ 49.952195] [] sys_write+0x50/0x90 [ 49.952195] [] system_call_fastpath+0x16/0x1b [ 49.952195] Code: 00 00 00 2b 05 09 1c 17 01 48 89 c6 49 0f af f4 48 c1 ee 02 48 89 f0 48 f7 e1 48 89 d6 31 d2 48 c1 ee 02 48 0f af 75 d0 48 89 f0 <48> f7 f7 41 8b 95 ac 01 00 00 48 89 c7 49 0f af d4 48 c1 ea 02 [ 49.952195] RIP [] get_dirty_limits+0xe9/0x2c0 [ 49.952195] RSP [ 50.096523] ---[ end trace 008d7aa02f244d7b ]--- Signed-off-by: Andrea Righi Cc: Peter Zijlstra Cc: David Rientjes Cc: Dave Chinner Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e3d2c7d..ea78fa1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -103,6 +103,9 @@ static unsigned long one_ul = 1; static int one_hundred = 100; static int one_thousand = 1000; +/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; @@ -1006,7 +1009,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &dirty_bytes_handler, .strategy = &sysctl_intvec, - .extra1 = &one_ul, + .extra1 = &dirty_bytes_min, }, { .procname = "dirty_writeback_centisecs", -- cgit v1.1