From 517a92c4e19fcea815332d3155e9fb7723251274 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 14 Feb 2008 09:02:13 +0100 Subject: panic: print more informative messages on stackprotect failure pointed out by pageexec@freemail.hu: we just simply panic() when there's a stackprotector attack - giving the attacked person no information about what kernel code the attack went against. print out the attacked function. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 425567f..f236001 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -327,7 +327,8 @@ EXPORT_SYMBOL(warn_on_slowpath); */ void __stack_chk_fail(void) { - panic("stack-protector: Kernel stack is corrupted"); + panic("stack-protector: Kernel stack is corrupted in: %p\n", + __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); #endif -- cgit v1.1 From 5cb273013e182a35e7db614d3e20a144cba71e53 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 14 Feb 2008 09:07:01 +0100 Subject: panic: print out stacktrace if DEBUG_BUGVERBOSE if CONFIG_DEBUG_BUGVERBOSE is set then the user most definitely wanted to see as much information about kernel crashes as possible - so give them at least a stack dump. this is particularly useful for stackprotector related panics, where the stacktrace can give us the exact location of the (attempted) attack. Pointed out by pageexec@freemail.hu in the stackprotector breakage threads. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index f236001..17aad57 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -80,6 +80,9 @@ NORET_TYPE void panic(const char * fmt, ...) vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); +#ifdef CONFIG_DEBUG_BUGVERBOSE + dump_stack(); +#endif bust_spinlocks(0); /* -- cgit v1.1 From 54371a43a66f4477889769b4fa00df936855dc8f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 15 Feb 2008 15:33:12 -0800 Subject: x86: add CONFIG_CC_STACKPROTECTOR self-test This patch adds a simple self-test capability to the stackprotector feature. The test deliberately overflows a stack buffer and then checks if the canary trap function gets called. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 17aad57..50cf925 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -324,14 +324,82 @@ EXPORT_SYMBOL(warn_on_slowpath); #endif #ifdef CONFIG_CC_STACKPROTECTOR + +static unsigned long __stack_check_testing; +/* + * Self test function for the stack-protector feature. + * This test requires that the local variable absolutely has + * a stack slot, hence the barrier()s. + */ +static noinline void __stack_chk_test_func(void) +{ + unsigned long foo; + barrier(); + /* + * we need to make sure we're not about to clobber the return address, + * while real exploits do this, it's unhealthy on a running system. + * Besides, if we would, the test is already failed anyway so + * time to pull the emergency brake on it. + */ + if ((unsigned long)__builtin_return_address(0) == + *(((unsigned long *)&foo)+1)) { + printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); + return; + } +#ifdef CONFIG_FRAME_POINTER + /* We also don't want to clobber the frame pointer */ + if ((unsigned long)__builtin_return_address(0) == + *(((unsigned long *)&foo)+2)) { + printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); + return; + } +#endif + barrier(); + if (current->stack_canary == *(((unsigned long *)&foo)+1)) + *(((unsigned long *)&foo)+1) = 0; + else + printk(KERN_ERR "No -fstack-protector canary found\n"); + barrier(); +} + +static int __stack_chk_test(void) +{ + printk(KERN_INFO "Testing -fstack-protector-all feature\n"); + __stack_check_testing = (unsigned long)&__stack_chk_test_func; + __stack_chk_test_func(); + if (__stack_check_testing) { + printk(KERN_ERR "-fstack-protector-all test failed\n"); + WARN_ON(1); + } + return 0; +} /* * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ void __stack_chk_fail(void) { + if (__stack_check_testing == (unsigned long)&__stack_chk_test_func) { + long delta; + + delta = (unsigned long)__builtin_return_address(0) - + __stack_check_testing; + /* + * The test needs to happen inside the test function, so + * check if the return address is close to that function. + * The function is only 2 dozen bytes long, but keep a wide + * safety margin to avoid panic()s for normal users regardless + * of the quality of the compiler. + */ + if (delta >= 0 && delta <= 400) { + __stack_check_testing = 0; + return; + } + } panic("stack-protector: Kernel stack is corrupted in: %p\n", __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); + +late_initcall(__stack_chk_test); #endif -- cgit v1.1 From b719ac56c0032bc1602914c6ea70b0f1581b08c7 Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Mon, 14 Apr 2008 10:03:50 -0700 Subject: panic.c: fix whitespace additions trivial: remove white space addition in stack protector Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 50cf925..866be9b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -341,14 +341,14 @@ static noinline void __stack_chk_test_func(void) * Besides, if we would, the test is already failed anyway so * time to pull the emergency brake on it. */ - if ((unsigned long)__builtin_return_address(0) == + if ((unsigned long)__builtin_return_address(0) == *(((unsigned long *)&foo)+1)) { printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); return; } #ifdef CONFIG_FRAME_POINTER /* We also don't want to clobber the frame pointer */ - if ((unsigned long)__builtin_return_address(0) == + if ((unsigned long)__builtin_return_address(0) == *(((unsigned long *)&foo)+2)) { printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); return; -- cgit v1.1 From b40a4392a3c262e0d1b5379b4e142a8eefa63439 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 18 Apr 2008 06:16:45 -0700 Subject: stackprotector: turn not having the right gcc into a #warning If the user selects the stack-protector config option, but does not have a gcc that has the right bits enabled (for example because it isn't build with a glibc that supports TLS, as is common for cross-compilers, but also because it may be too old), then the runtime test fails right now. This patch adds a warning message for this scenario. This warning accomplishes two goals 1) the user is informed that the security option he selective isn't available 2) the user is suggested to turn of the CONFIG option that won't work for him, and would make the runtime test fail anyway. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 866be9b..6729e3f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -325,6 +325,9 @@ EXPORT_SYMBOL(warn_on_slowpath); #ifdef CONFIG_CC_STACKPROTECTOR +#ifndef GCC_HAS_SP +#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this. +#endif static unsigned long __stack_check_testing; /* * Self test function for the stack-protector feature. -- cgit v1.1 From 7c9f8861e6c9c839f913e49b98c3854daca18f27 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 22 Apr 2008 16:38:23 -0500 Subject: stackprotector: use canary at end of stack to indicate overruns at oops time (Updated with a common max-stack-used checker that knows about the canary, as suggested by Joe Perches) Use a canary at the end of the stack to clearly indicate at oops time whether the stack has ever overflowed. This is a very simple implementation with a couple of drawbacks: 1) a thread may legitimately use exactly up to the last word on the stack -- but the chances of doing this and then oopsing later seem slim 2) it's possible that the stack usage isn't dense enough that the canary location could get skipped over -- but the worst that happens is that we don't flag the overrun -- though this happens fairly often in my testing :( With the code in place, an intentionally-bloated stack oops might do: BUG: unable to handle kernel paging request at ffff8103f84cc680 IP: [] update_curr+0x9a/0xa8 PGD 8063 PUD 0 Thread overran stack or stack corrupted Oops: 0000 [1] SMP CPU 0 ... ... unless the stack overrun is so bad that it corrupts some other thread. Signed-off-by: Eric Sandeen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/exit.c | 5 +---- kernel/fork.c | 5 +++++ kernel/sched.c | 7 +------ 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 8f6185e..fb8de6c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -899,12 +899,9 @@ static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; - unsigned long *n = end_of_stack(current); unsigned long free; - while (*n == 0) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(current); + free = stack_not_used(current); if (free >= lowest_to_date) return; diff --git a/kernel/fork.c b/kernel/fork.c index 19908b2..d428336 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -186,6 +187,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; + unsigned long *stackend; + int err; prepare_to_copy(orig); @@ -211,6 +214,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto out; setup_thread_stack(tsk, orig); + stackend = end_of_stack(tsk); + *stackend = STACK_END_MAGIC; /* for overflow detection */ #ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a..a964ed9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5748,12 +5748,7 @@ void sched_show_task(struct task_struct *p) printk(KERN_CONT " %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE - { - unsigned long *n = end_of_stack(p); - while (!*n) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(p); - } + free = stack_not_used(p); #endif printk(KERN_CONT "%5lu %5d %6d\n", free, task_pid_nr(p), task_pid_nr(p->real_parent)); -- cgit v1.1 From aa92db14270b79f0f91a9060b547a46f9e2639da Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 11 Jul 2008 05:09:55 -0700 Subject: stackprotector: better self-test check stackprotector functionality by manipulating the canary briefly during bootup. far more robust than trying to overflow the stack. (which is architecture dependent, etc.) Signed-off-by: Ingo Molnar --- kernel/panic.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6729e3f..28153ae 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -347,22 +347,18 @@ static noinline void __stack_chk_test_func(void) if ((unsigned long)__builtin_return_address(0) == *(((unsigned long *)&foo)+1)) { printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); - return; } #ifdef CONFIG_FRAME_POINTER /* We also don't want to clobber the frame pointer */ if ((unsigned long)__builtin_return_address(0) == *(((unsigned long *)&foo)+2)) { printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); - return; } #endif - barrier(); - if (current->stack_canary == *(((unsigned long *)&foo)+1)) - *(((unsigned long *)&foo)+1) = 0; - else + if (current->stack_canary != *(((unsigned long *)&foo)+1)) printk(KERN_ERR "No -fstack-protector canary found\n"); - barrier(); + + current->stack_canary = ~current->stack_canary; } static int __stack_chk_test(void) @@ -373,7 +369,8 @@ static int __stack_chk_test(void) if (__stack_check_testing) { printk(KERN_ERR "-fstack-protector-all test failed\n"); WARN_ON(1); - } + }; + current->stack_canary = ~current->stack_canary; return 0; } /* -- cgit v1.1 From af9ff7868f0f76d3364351b1641b9dfa99588e77 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 12 Jul 2008 09:36:38 -0700 Subject: x86: simplify stackprotector self-check Clean up the code by removing no longer needed code; make sure the pda is updated and kept in sync Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/panic.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 28153ae..87445a8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -328,37 +328,21 @@ EXPORT_SYMBOL(warn_on_slowpath); #ifndef GCC_HAS_SP #warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this. #endif + static unsigned long __stack_check_testing; + /* * Self test function for the stack-protector feature. * This test requires that the local variable absolutely has - * a stack slot, hence the barrier()s. + * a stack slot. */ static noinline void __stack_chk_test_func(void) { - unsigned long foo; - barrier(); - /* - * we need to make sure we're not about to clobber the return address, - * while real exploits do this, it's unhealthy on a running system. - * Besides, if we would, the test is already failed anyway so - * time to pull the emergency brake on it. - */ - if ((unsigned long)__builtin_return_address(0) == - *(((unsigned long *)&foo)+1)) { - printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); - } -#ifdef CONFIG_FRAME_POINTER - /* We also don't want to clobber the frame pointer */ - if ((unsigned long)__builtin_return_address(0) == - *(((unsigned long *)&foo)+2)) { - printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); - } -#endif - if (current->stack_canary != *(((unsigned long *)&foo)+1)) - printk(KERN_ERR "No -fstack-protector canary found\n"); + unsigned long dummy_buffer[64]; /* force gcc to use the canary */ current->stack_canary = ~current->stack_canary; + refresh_stack_canary(); + dummy_buffer[3] = 1; /* fool gcc into keeping the variable */ } static int __stack_chk_test(void) @@ -371,6 +355,7 @@ static int __stack_chk_test(void) WARN_ON(1); }; current->stack_canary = ~current->stack_canary; + refresh_stack_canary(); return 0; } /* -- cgit v1.1 From 4f962d4d65923d7b722192e729840cfb79af0a5a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 13 Jul 2008 21:42:44 +0200 Subject: stackprotector: remove self-test turns out gcc generates such stackprotector-failure sequences in certain circumstances: movq -8(%rbp), %rax # D.16032, xorq %gs:40, %rax #, jne .L17 #, leave ret .L17: call __stack_chk_fail # .size __stack_chk_test_func, .-__stack_chk_test_func .section .init.text,"ax",@progbits .type panic_setup, @function panic_setup: pushq %rbp # note that there's no jump back to the failing context after the call to __stack_chk_fail - i.e. it has a ((noreturn)) attribute. Which is fair enough in the normal case but kills the self-test. (as we cannot reliably return in the self-test) Signed-off-by: Ingo Molnar --- kernel/panic.c | 47 ----------------------------------------------- 1 file changed, 47 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 87445a8..c35c9ec 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -329,62 +329,15 @@ EXPORT_SYMBOL(warn_on_slowpath); #warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this. #endif -static unsigned long __stack_check_testing; - -/* - * Self test function for the stack-protector feature. - * This test requires that the local variable absolutely has - * a stack slot. - */ -static noinline void __stack_chk_test_func(void) -{ - unsigned long dummy_buffer[64]; /* force gcc to use the canary */ - - current->stack_canary = ~current->stack_canary; - refresh_stack_canary(); - dummy_buffer[3] = 1; /* fool gcc into keeping the variable */ -} - -static int __stack_chk_test(void) -{ - printk(KERN_INFO "Testing -fstack-protector-all feature\n"); - __stack_check_testing = (unsigned long)&__stack_chk_test_func; - __stack_chk_test_func(); - if (__stack_check_testing) { - printk(KERN_ERR "-fstack-protector-all test failed\n"); - WARN_ON(1); - }; - current->stack_canary = ~current->stack_canary; - refresh_stack_canary(); - return 0; -} /* * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ void __stack_chk_fail(void) { - if (__stack_check_testing == (unsigned long)&__stack_chk_test_func) { - long delta; - - delta = (unsigned long)__builtin_return_address(0) - - __stack_check_testing; - /* - * The test needs to happen inside the test function, so - * check if the return address is close to that function. - * The function is only 2 dozen bytes long, but keep a wide - * safety margin to avoid panic()s for normal users regardless - * of the quality of the compiler. - */ - if (delta >= 0 && delta <= 400) { - __stack_check_testing = 0; - return; - } - } panic("stack-protector: Kernel stack is corrupted in: %p\n", __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); -late_initcall(__stack_chk_test); #endif -- cgit v1.1 From 1cc4fff0b360aeffeedb7d6db5089d88dd861700 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 22 Dec 2008 02:24:48 +0100 Subject: hrtimers: increase clock min delta threshold while interrupt hanging Impact: avoid timer IRQ hanging slow systems While using the function graph tracer on a virtualized system, the hrtimer_interrupt can hang the system on an infinite loop. This can be caused in several situations: - the hardware is very slow and HZ is set too high - something intrusive is slowing the system down (tracing under emulation) ... and the next clock events to program are always before the current time. This patch implements a reasonable compromise: if such a situation is detected, we share the CPUs time in 1/4 to process the hrtimer interrupts. This is enough to let the system running without serious starvation. It has been successfully tested under VirtualBox with 1000 HZ and 100 HZ with function graph tracer launched. On both cases, the clock events were increased until about 25 ms periodic ticks, which means 40 HZ. So we change a hard to debug hang into a warning message and a system that still manages to limp along. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index bda9cb9..c2a69b8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1171,6 +1171,29 @@ static void __run_hrtimer(struct hrtimer *timer) #ifdef CONFIG_HIGH_RES_TIMERS +static int force_clock_reprogram; + +/* + * After 5 iteration's attempts, we consider that hrtimer_interrupt() + * is hanging, which could happen with something that slows the interrupt + * such as the tracing. Then we force the clock reprogramming for each future + * hrtimer interrupts to avoid infinite loops and use the min_delta_ns + * threshold that we will overwrite. + * The next tick event will be scheduled to 3 times we currently spend on + * hrtimer_interrupt(). This gives a good compromise, the cpus will spend + * 1/4 of their time to process the hrtimer interrupts. This is enough to + * let it running without serious starvation. + */ + +static inline void +hrtimer_interrupt_hanging(struct clock_event_device *dev, + ktime_t try_time) +{ + force_clock_reprogram = 1; + dev->min_delta_ns = (unsigned long)try_time.tv64 * 3; + printk(KERN_WARNING "hrtimer: interrupt too slow, " + "forcing clock min delta to %lu ns\n", dev->min_delta_ns); +} /* * High resolution timer interrupt * Called with interrupts disabled @@ -1180,6 +1203,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; ktime_t expires_next, now; + int nr_retries = 0; int i; BUG_ON(!cpu_base->hres_active); @@ -1187,6 +1211,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; retry: + /* 5 retries is enough to notice a hang */ + if (!(++nr_retries % 5)) + hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now)); + now = ktime_get(); expires_next.tv64 = KTIME_MAX; @@ -1239,7 +1267,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) /* Reprogramming necessary ? */ if (expires_next.tv64 != KTIME_MAX) { - if (tick_program_event(expires_next, 0)) + if (tick_program_event(expires_next, force_clock_reprogram)) goto retry; } } -- cgit v1.1 From efdc64f0c792ea744bcc9203f35b908e66d42f41 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Mon, 29 Dec 2008 13:35:11 +0800 Subject: genirq: check chip->ack before calling Impact: fix theoretical NULL dereference The generic irq layer doesn't know whether irq_chip has ack routine on some architectures or not. Upon that, before calling chip->ack, we should check that it's not NULL. Signed-off-by: Wang Chen Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6eb3c79..0ad02d7 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -290,7 +290,8 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) desc->chip->mask_ack(irq); else { desc->chip->mask(irq); - desc->chip->ack(irq); + if (desc->chip->ack) + desc->chip->ack(irq); } } @@ -475,7 +476,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ - desc->chip->ack(irq); + if (desc->chip->ack) + desc->chip->ack(irq); desc = irq_remap_to_desc(irq, desc); /* Mark the IRQ currently in progress.*/ -- cgit v1.1 From 4d9842776a23e52ec4c60e0a79f5e1bbe91e463e Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:49 -0500 Subject: sched: cleanup inc/dec_rt_tasks Move some common definitions up to the function prologe to simplify the body logic. Signed-off-by: Gregory Haskins --- kernel/sched_rt.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1bbd990..0a52772 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -550,30 +550,28 @@ static void update_curr_rt(struct rq *rq) static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { - WARN_ON(!rt_prio(rt_se_prio(rt_se))); - rt_rq->rt_nr_running++; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - if (rt_se_prio(rt_se) < rt_rq->highest_prio) { + int prio = rt_se_prio(rt_se); #ifdef CONFIG_SMP - struct rq *rq = rq_of_rt_rq(rt_rq); + struct rq *rq = rq_of_rt_rq(rt_rq); #endif - rt_rq->highest_prio = rt_se_prio(rt_se); + WARN_ON(!rt_prio(prio)); + rt_rq->rt_nr_running++; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + if (prio < rt_rq->highest_prio) { + + rt_rq->highest_prio = prio; #ifdef CONFIG_SMP if (rq->online) - cpupri_set(&rq->rd->cpupri, rq->cpu, - rt_se_prio(rt_se)); + cpupri_set(&rq->rd->cpupri, rq->cpu, prio); #endif } #endif #ifdef CONFIG_SMP - if (rt_se->nr_cpus_allowed > 1) { - struct rq *rq = rq_of_rt_rq(rt_rq); - + if (rt_se->nr_cpus_allowed > 1) rq->rt.rt_nr_migratory++; - } - update_rt_migration(rq_of_rt_rq(rt_rq)); + update_rt_migration(rq); #endif #ifdef CONFIG_RT_GROUP_SCHED if (rt_se_boosted(rt_se)) @@ -590,6 +588,7 @@ static inline void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { #ifdef CONFIG_SMP + struct rq *rq = rq_of_rt_rq(rt_rq); int highest_prio = rt_rq->highest_prio; #endif @@ -611,20 +610,13 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->highest_prio = MAX_RT_PRIO; #endif #ifdef CONFIG_SMP - if (rt_se->nr_cpus_allowed > 1) { - struct rq *rq = rq_of_rt_rq(rt_rq); + if (rt_se->nr_cpus_allowed > 1) rq->rt.rt_nr_migratory--; - } - if (rt_rq->highest_prio != highest_prio) { - struct rq *rq = rq_of_rt_rq(rt_rq); - - if (rq->online) - cpupri_set(&rq->rd->cpupri, rq->cpu, - rt_rq->highest_prio); - } + if (rq->online && rt_rq->highest_prio != highest_prio) + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio); - update_rt_migration(rq_of_rt_rq(rt_rq)); + update_rt_migration(rq); #endif /* CONFIG_SMP */ #ifdef CONFIG_RT_GROUP_SCHED if (rt_se_boosted(rt_se)) -- cgit v1.1 From e864c499d9e57805ae1f9e7ea404dd223759cd53 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:49 -0500 Subject: sched: track the next-highest priority on each runqueue We will use this later in the series to reduce the amount of rq-lock contention during a pull operation Signed-off-by: Gregory Haskins --- kernel/sched.c | 8 ++++-- kernel/sched_rt.c | 81 +++++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 67 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 756d981..7729f9a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -463,7 +463,10 @@ struct rt_rq { struct rt_prio_array active; unsigned long rt_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - int highest_prio; /* highest queued rt task prio */ + struct { + int curr; /* highest queued rt task prio */ + int next; /* next highest */ + } highest_prio; #endif #ifdef CONFIG_SMP unsigned long rt_nr_migratory; @@ -8169,7 +8172,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) __set_bit(MAX_RT_PRIO, array->bitmap); #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - rt_rq->highest_prio = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->highest_prio.next = MAX_RT_PRIO; #endif #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0a52772..ad36d72 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -108,7 +108,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (rt_rq->rt_nr_running) { if (rt_se && !on_rt_rq(rt_se)) enqueue_rt_entity(rt_se); - if (rt_rq->highest_prio < curr->prio) + if (rt_rq->highest_prio.curr < curr->prio) resched_task(curr); } } @@ -473,7 +473,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq) - return rt_rq->highest_prio; + return rt_rq->highest_prio.curr; #endif return rt_task_of(rt_se)->prio; @@ -547,6 +547,21 @@ static void update_curr_rt(struct rq *rq) } } +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); + +static inline int next_prio(struct rq *rq) +{ + struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu); + + if (next && rt_prio(next->prio)) + return next->prio; + else + return MAX_RT_PRIO; +} +#endif + static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { @@ -558,14 +573,32 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(prio)); rt_rq->rt_nr_running++; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - if (prio < rt_rq->highest_prio) { + if (prio < rt_rq->highest_prio.curr) { - rt_rq->highest_prio = prio; + /* + * If the new task is higher in priority than anything on the + * run-queue, we have a new high that must be published to + * the world. We also know that the previous high becomes + * our next-highest. + */ + rt_rq->highest_prio.next = rt_rq->highest_prio.curr; + rt_rq->highest_prio.curr = prio; #ifdef CONFIG_SMP if (rq->online) cpupri_set(&rq->rd->cpupri, rq->cpu, prio); #endif - } + } else if (prio == rt_rq->highest_prio.curr) + /* + * If the next task is equal in priority to the highest on + * the run-queue, then we implicitly know that the next highest + * task cannot be any lower than current + */ + rt_rq->highest_prio.next = prio; + else if (prio < rt_rq->highest_prio.next) + /* + * Otherwise, we need to recompute next-highest + */ + rt_rq->highest_prio.next = next_prio(rq); #endif #ifdef CONFIG_SMP if (rt_se->nr_cpus_allowed > 1) @@ -589,7 +622,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { #ifdef CONFIG_SMP struct rq *rq = rq_of_rt_rq(rt_rq); - int highest_prio = rt_rq->highest_prio; + int highest_prio = rt_rq->highest_prio.curr; #endif WARN_ON(!rt_prio(rt_se_prio(rt_se))); @@ -597,24 +630,32 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rt_nr_running--; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_rq->rt_nr_running) { - struct rt_prio_array *array; + int prio = rt_se_prio(rt_se); + + WARN_ON(prio < rt_rq->highest_prio.curr); - WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); - if (rt_se_prio(rt_se) == rt_rq->highest_prio) { - /* recalculate */ - array = &rt_rq->active; - rt_rq->highest_prio = + /* + * This may have been our highest or next-highest priority + * task and therefore we may have some recomputation to do + */ + if (prio == rt_rq->highest_prio.curr) { + struct rt_prio_array *array = &rt_rq->active; + + rt_rq->highest_prio.curr = sched_find_first_bit(array->bitmap); - } /* otherwise leave rq->highest prio alone */ + } + + if (prio <= rt_rq->highest_prio.next) + rt_rq->highest_prio.next = next_prio(rq); } else - rt_rq->highest_prio = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO; #endif #ifdef CONFIG_SMP if (rt_se->nr_cpus_allowed > 1) rq->rt.rt_nr_migratory--; - if (rq->online && rt_rq->highest_prio != highest_prio) - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio); + if (rq->online && rt_rq->highest_prio.curr != highest_prio) + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); update_rt_migration(rq); #endif /* CONFIG_SMP */ @@ -1064,7 +1105,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) } /* If this rq is still suitable use it. */ - if (lowest_rq->rt.highest_prio > task->prio) + if (lowest_rq->rt.highest_prio.curr > task->prio) break; /* try again */ @@ -1252,7 +1293,7 @@ static int pull_rt_task(struct rq *this_rq) static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) + if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) pull_rt_task(rq); } @@ -1338,7 +1379,7 @@ static void rq_online_rt(struct rq *rq) __enable_runtime(rq); - cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } /* Assumes rq->lock is held */ @@ -1429,7 +1470,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p, * can release the rq lock and p could migrate. * Only reschedule if p is still on the same runqueue. */ - if (p->prio > rq->rt.highest_prio && rq->curr == p) + if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) resched_task(p); #else /* For UP simply resched on drop of prio */ -- cgit v1.1 From a8728944efe23417e38bf22063f06d9d8ee21d59 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:49 -0500 Subject: sched: use highest_prio.curr for pull threshold highest_prio.curr is actually a more accurate way to keep track of the pull_rt_task() threshold since it is always up to date, even if the "next" task migrates during double_lock. Therefore, stop looking at the "next" task object and simply use the highest_prio.curr. Signed-off-by: Gregory Haskins --- kernel/sched_rt.c | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index ad36d72..f8fb3ed 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1207,14 +1207,12 @@ static void push_rt_tasks(struct rq *rq) static int pull_rt_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, ret = 0, cpu; - struct task_struct *p, *next; + struct task_struct *p; struct rq *src_rq; if (likely(!rt_overloaded(this_rq))) return 0; - next = pick_next_task_rt(this_rq); - for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; @@ -1223,17 +1221,9 @@ static int pull_rt_task(struct rq *this_rq) /* * We can potentially drop this_rq's lock in * double_lock_balance, and another CPU could - * steal our next task - hence we must cause - * the caller to recalculate the next task - * in that case: + * alter this_rq */ - if (double_lock_balance(this_rq, src_rq)) { - struct task_struct *old_next = next; - - next = pick_next_task_rt(this_rq); - if (next != old_next) - ret = 1; - } + double_lock_balance(this_rq, src_rq); /* * Are there still pullable RT tasks? @@ -1247,7 +1237,7 @@ static int pull_rt_task(struct rq *this_rq) * Do we have an RT task that preempts * the to-be-scheduled task? */ - if (p && (!next || (p->prio < next->prio))) { + if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); WARN_ON(!p->se.on_rq); @@ -1257,12 +1247,9 @@ static int pull_rt_task(struct rq *this_rq) * This is just that p is wakeing up and hasn't * had a chance to schedule. We only pull * p if it is lower in priority than the - * current task on the run queue or - * this_rq next task is lower in prio than - * the current task on that rq. + * current task on the run queue */ - if (p->prio < src_rq->curr->prio || - (next && next->prio < src_rq->curr->prio)) + if (p->prio < src_rq->curr->prio) goto skip; ret = 1; @@ -1275,13 +1262,7 @@ static int pull_rt_task(struct rq *this_rq) * case there's an even higher prio task * in another runqueue. (low likelyhood * but possible) - * - * Update next so that we won't pick a task - * on another cpu with a priority lower (or equal) - * than the one we just picked. */ - next = p; - } skip: double_unlock_balance(this_rq, src_rq); -- cgit v1.1 From 74ab8e4f6412c0b2d730fe5de28dc21de8b92c01 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:50 -0500 Subject: sched: use highest_prio.next to optimize pull operations We currently take the rq->lock for every cpu in an overload state during pull_rt_tasks(). However, we now have enough information via the highest_prio.[curr|next] fields to determine if there is any tasks of interest to warrant the overhead of the rq->lock, before we actually take it. So we use this information to reduce lock contention during the pull for the case where the source-rq doesnt have tasks that preempt the current task. Signed-off-by: Gregory Haskins --- kernel/sched_rt.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f8fb3ed..d047f28 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1218,6 +1218,18 @@ static int pull_rt_task(struct rq *this_rq) continue; src_rq = cpu_rq(cpu); + + /* + * Don't bother taking the src_rq->lock if the next highest + * task is known to be lower-priority than our current task. + * This may look racy, but if this value is about to go + * logically higher, the src_rq will push this task away. + * And if its going logically lower, we do not care + */ + if (src_rq->rt.highest_prio.next >= + this_rq->rt.highest_prio.curr) + continue; + /* * We can potentially drop this_rq's lock in * double_lock_balance, and another CPU could -- cgit v1.1 From 777c2f389e463428fd7e2871051a84d7fe84b172 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:50 -0500 Subject: sched: only try to push a task on wakeup if it is migratable There is no sense in wasting time trying to push a task away that cannot move anywhere else. We gain no benefit from trying to push other tasks at this point, so if the task being woken up is non migratable, just skip the whole operation. This reduces overhead in the wakeup path for certain tasks. Signed-off-by: Gregory Haskins --- kernel/sched_rt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d047f28..8d33843 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1314,7 +1314,8 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - rq->rt.overloaded) + rq->rt.overloaded && + p->rt.nr_cpus_allowed > 1) push_rt_tasks(rq); } -- cgit v1.1 From 7e96fa5875d4a9be18d74d3ca7b90518d05bc426 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:50 -0500 Subject: sched: pull only one task during NEWIDLE balancing to limit critical section git-id c4acb2c0669c5c5c9b28e9d02a34b5c67edf7092 attempted to limit newidle critical section length by stopping after at least one task was moved. Further investigation has shown that there are other paths nested further inside the algorithm which still remain that allow long latencies to occur with newidle balancing. This patch applies the same technique inside balance_tasks() to limit the duration of this optional balancing operation. Signed-off-by: Gregory Haskins CC: Nick Piggin --- kernel/sched.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7729f9a..94d9a6c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2984,6 +2984,16 @@ next: pulled++; rem_load_move -= p->se.load.weight; +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible kernels + * will stop after the first task is pulled to minimize the critical + * section. + */ + if (idle == CPU_NEWLY_IDLE) + goto out; +#endif + /* * We only want to steal up to the prescribed amount of weighted load. */ @@ -3030,9 +3040,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, sd, idle, all_pinned, &this_best_prio); class = class->next; +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible + * kernels will stop after the first task is pulled to minimize + * the critical section. + */ if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) break; - +#endif } while (class && max_load_move > total_load_moved); return total_load_moved > 0; -- cgit v1.1 From 8f45e2b516201d1bf681e6026fa5276385def565 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:51 -0500 Subject: sched: make double-lock-balance fair double_lock balance() currently favors logically lower cpus since they often do not have to release their own lock to acquire a second lock. The result is that logically higher cpus can get starved when there is a lot of pressure on the RQs. This can result in higher latencies on higher cpu-ids. This patch makes the algorithm more fair by forcing all paths to have to release both locks before acquiring them again. Since callsites to double_lock_balance already consider it a potential preemption/reschedule point, they have the proper logic to recheck for atomicity violations. Signed-off-by: Gregory Haskins --- kernel/sched.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 94d9a6c..8fca364 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1608,21 +1608,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +#ifdef CONFIG_PREEMPT + /* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + spin_unlock(&this_rq->lock); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. + */ +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __releases(this_rq->lock) __acquires(busiest->lock) __acquires(this_rq->lock) { int ret = 0; - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); - BUG_ON(1); - } if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); @@ -1635,6 +1656,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + + return _double_lock_balance(this_rq, busiest); +} + static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { -- cgit v1.1 From 967fc04671feea4dbf780c9e55a0bc8fcf68a14e Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:52 -0500 Subject: sched: add sched_class->needs_post_schedule() member We currently run class->post_schedule() outside of the rq->lock, which means that we need to test for the need to post_schedule outside of the lock to avoid a forced reacquistion. This is currently not a problem as we only look at rq->rt.overloaded. However, we want to enhance this going forward to look at more state to reduce the need to post_schedule to a bare minimum set. Therefore, we introduce a new member-func called needs_post_schedule() which tests for the post_schedule condtion without actually performing the work. Therefore it is safe to call this function before the rq->lock is released, because we are guaranteed not to drop the lock at an intermediate point (such as what post_schedule() may do). We will use this later in the series [ rostedt: removed paranoid BUG_ON ] Signed-off-by: Gregory Haskins --- kernel/sched.c | 8 +++++++- kernel/sched_rt.c | 24 ++++++++++++++---------- 2 files changed, 21 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8fca364..3acbad8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2621,6 +2621,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) { struct mm_struct *mm = rq->prev_mm; long prev_state; +#ifdef CONFIG_SMP + int post_schedule = 0; + + if (current->sched_class->needs_post_schedule) + post_schedule = current->sched_class->needs_post_schedule(rq); +#endif rq->prev_mm = NULL; @@ -2639,7 +2645,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_switch(prev); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP - if (current->sched_class->post_schedule) + if (post_schedule) current->sched_class->post_schedule(rq); #endif diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8d33843..b0b6ea4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1290,20 +1290,23 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) pull_rt_task(rq); } +/* + * assumes rq->lock is held + */ +static int needs_post_schedule_rt(struct rq *rq) +{ + return rq->rt.overloaded ? 1 : 0; +} + static void post_schedule_rt(struct rq *rq) { /* - * If we have more than one rt_task queued, then - * see if we can push the other rt_tasks off to other CPUS. - * Note we may release the rq lock, and since - * the lock was owned by prev, we need to release it - * first via finish_lock_switch and then reaquire it here. + * This is only called if needs_post_schedule_rt() indicates that + * we need to push tasks away */ - if (unlikely(rq->rt.overloaded)) { - spin_lock_irq(&rq->lock); - push_rt_tasks(rq); - spin_unlock_irq(&rq->lock); - } + spin_lock_irq(&rq->lock); + push_rt_tasks(rq); + spin_unlock_irq(&rq->lock); } /* @@ -1557,6 +1560,7 @@ static const struct sched_class rt_sched_class = { .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, + .needs_post_schedule = needs_post_schedule_rt, .post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, .switched_from = switched_from_rt, -- cgit v1.1 From 917b627d4d981dc614519d7b34ea31a976b14e12 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:53 -0500 Subject: sched: create "pushable_tasks" list to limit pushing to one attempt The RT scheduler employs a "push/pull" design to actively balance tasks within the system (on a per disjoint cpuset basis). When a task is awoken, it is immediately determined if there are any lower priority cpus which should be preempted. This is opposed to the way normal SCHED_OTHER tasks behave, which will wait for a periodic rebalancing operation to occur before spreading out load. When a particular RQ has more than 1 active RT task, it is said to be in an "overloaded" state. Once this occurs, the system enters the active balancing mode, where it will try to push the task away, or persuade a different cpu to pull it over. The system will stay in this state until the system falls back below the <= 1 queued RT task per RQ. However, the current implementation suffers from a limitation in the push logic. Once overloaded, all tasks (other than current) on the RQ are analyzed on every push operation, even if it was previously unpushable (due to affinity, etc). Whats more, the operation stops at the first task that is unpushable and will not look at items lower in the queue. This causes two problems: 1) We can have the same tasks analyzed over and over again during each push, which extends out the fast path in the scheduler for no gain. Consider a RQ that has dozens of tasks that are bound to a core. Each one of those tasks will be encountered and skipped for each push operation while they are queued. 2) There may be lower-priority tasks under the unpushable task that could have been successfully pushed, but will never be considered until either the unpushable task is cleared, or a pull operation succeeds. The net result is a potential latency source for mid priority tasks. This patch aims to rectify these two conditions by introducing a new priority sorted list: "pushable_tasks". A task is added to the list each time a task is activated or preempted. It is removed from the list any time it is deactivated, made current, or fails to push. This works because a task only needs to be attempted to push once. After an initial failure to push, the other cpus will eventually try to pull the task when the conditions are proper. This also solves the problem that we don't completely analyze all tasks due to encountering an unpushable tasks. Now every task will have a push attempted (when appropriate). This reduces latency both by shorting the critical section of the rq->lock for certain workloads, and by making sure the algorithm considers all eligible tasks in the system. [ rostedt: added a couple more BUG_ONs ] Signed-off-by: Gregory Haskins Acked-by: Steven Rostedt --- kernel/sched.c | 4 ++ kernel/sched_rt.c | 119 +++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 105 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3acbad8..24ab80c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -471,6 +471,7 @@ struct rt_rq { #ifdef CONFIG_SMP unsigned long rt_nr_migratory; int overloaded; + struct plist_head pushable_tasks; #endif int rt_throttled; u64 rt_time; @@ -2481,6 +2482,8 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif + plist_node_init(&p->pushable_tasks, MAX_PRIO); + put_cpu(); } @@ -8237,6 +8240,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; + plist_head_init(&rq->rt.pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b0b6ea4..fe9da60 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -49,6 +49,24 @@ static void update_rt_migration(struct rq *rq) rq->rt.overloaded = 0; } } + +static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); + plist_node_init(&p->pushable_tasks, p->prio); + plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + +static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + +#else + +#define enqueue_pushable_task(rq, p) do { } while (0) +#define dequeue_pushable_task(rq, p) do { } while (0) + #endif /* CONFIG_SMP */ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) @@ -751,6 +769,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) enqueue_rt_entity(rt_se); + if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); + inc_cpu_load(rq, p->se.load.weight); } @@ -761,6 +782,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) update_curr_rt(rq); dequeue_rt_entity(rt_se); + dequeue_pushable_task(rq, p); + dec_cpu_load(rq, p->se.load.weight); } @@ -911,7 +934,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, return next; } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; struct task_struct *p; @@ -933,6 +956,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) p = rt_task_of(rt_se); p->se.exec_start = rq->clock; + + return p; +} + +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ + struct task_struct *p = _pick_next_task_rt(rq); + + /* The running task is never eligible for pushing */ + if (p) + dequeue_pushable_task(rq, p); + return p; } @@ -940,6 +975,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); p->se.exec_start = 0; + + /* + * The previous task needs to be made eligible for pushing + * if it is still active + */ + if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); } #ifdef CONFIG_SMP @@ -1116,6 +1158,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->rt.nr_cpus_allowed <= 1); + + BUG_ON(!p->se.on_rq); + BUG_ON(!rt_task(p)); + + return p; +} + /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task @@ -1125,13 +1192,12 @@ static int push_rt_task(struct rq *rq) { struct task_struct *next_task; struct rq *lowest_rq; - int ret = 0; int paranoid = RT_MAX_TRIES; if (!rq->rt.overloaded) return 0; - next_task = pick_next_highest_task_rt(rq, -1); + next_task = pick_next_pushable_task(rq); if (!next_task) return 0; @@ -1163,12 +1229,19 @@ static int push_rt_task(struct rq *rq) * so it is possible that next_task has changed. * If it has, then try again. */ - task = pick_next_highest_task_rt(rq, -1); + task = pick_next_pushable_task(rq); if (unlikely(task != next_task) && task && paranoid--) { put_task_struct(next_task); next_task = task; goto retry; } + + /* + * Once we have failed to push this task, we will not + * try again, since the other cpus will pull from us + * when they are ready + */ + dequeue_pushable_task(rq, next_task); goto out; } @@ -1180,23 +1253,12 @@ static int push_rt_task(struct rq *rq) double_unlock_balance(rq, lowest_rq); - ret = 1; out: put_task_struct(next_task); - return ret; + return 1; } -/* - * TODO: Currently we just use the second highest prio task on - * the queue, and stop when it can't migrate (or there's - * no more RT tasks). There may be a case where a lower - * priority RT task has a different affinity than the - * higher RT task. In this case the lower RT task could - * possibly be able to migrate where as the higher priority - * RT task could not. We currently ignore this issue. - * Enhancements are welcome! - */ static void push_rt_tasks(struct rq *rq) { /* push_rt_task will return true if it moved an RT */ @@ -1295,7 +1357,7 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) */ static int needs_post_schedule_rt(struct rq *rq) { - return rq->rt.overloaded ? 1 : 0; + return has_pushable_tasks(rq); } static void post_schedule_rt(struct rq *rq) @@ -1317,7 +1379,7 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - rq->rt.overloaded && + has_pushable_tasks(rq) && p->rt.nr_cpus_allowed > 1) push_rt_tasks(rq); } @@ -1354,6 +1416,24 @@ static void set_cpus_allowed_rt(struct task_struct *p, if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { struct rq *rq = task_rq(p); + if (!task_current(rq, p)) { + /* + * Make sure we dequeue this task from the pushable list + * before going further. It will either remain off of + * the list because we are no longer pushable, or it + * will be requeued. + */ + if (p->rt.nr_cpus_allowed > 1) + dequeue_pushable_task(rq, p); + + /* + * Requeue if our weight is changing and still > 1 + */ + if (weight > 1) + enqueue_pushable_task(rq, p); + + } + if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { rq->rt.rt_nr_migratory++; } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { @@ -1538,6 +1618,9 @@ static void set_curr_task_rt(struct rq *rq) struct task_struct *p = rq->curr; p->se.exec_start = rq->clock; + + /* The running task is never eligible for pushing */ + dequeue_pushable_task(rq, p); } static const struct sched_class rt_sched_class = { -- cgit v1.1 From 1563513d34ed4b12ef32bc2adde4a53ce05701a1 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:53 -0500 Subject: RT: fix push_rt_task() to handle dequeue_pushable properly A panic was discovered by Chirag Jog where a BUG_ON sanity check in the new "pushable_task" logic would trigger a panic under certain circumstances: http://lkml.org/lkml/2008/9/25/189 Gilles Carry discovered that the root cause was attributed to the pushable_tasks list getting corrupted in the push_rt_task logic. This was the result of a dropped rq lock in double_lock_balance allowing a task in the process of being pushed to potentially migrate away, and thus corrupt the pushable_tasks() list. I traced back the problem as introduced by the pushable_tasks patch that went in recently. There is a "retry" path in push_rt_task() that actually had a compound conditional to decide whether to retry or exit. I missed the meaning behind the rationale for the virtual "if(!task) goto out;" portion of the compound statement and thus did not handle it properly. The new pushable_tasks logic actually creates three distinct conditions: 1) an untouched and unpushable task should be dequeued 2) a migrated task where more pushable tasks remain should be retried 3) a migrated task where no more pushable tasks exist should exit The original logic mushed (1) and (3) together, resulting in the system dequeuing a migrated task (against an unlocked foreign run-queue nonetheless). To fix this, we get rid of the notion of "paranoid" and we support the three unique conditions properly. The paranoid feature is no longer relevant with the new pushable logic (since pushable naturally limits the loop) anyway, so lets just remove it. Reported-By: Chirag Jog Found-by: Gilles Carry Signed-off-by: Gregory Haskins --- kernel/sched_rt.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fe9da60..64a8f0a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1192,7 +1192,6 @@ static int push_rt_task(struct rq *rq) { struct task_struct *next_task; struct rq *lowest_rq; - int paranoid = RT_MAX_TRIES; if (!rq->rt.overloaded) return 0; @@ -1226,23 +1225,34 @@ static int push_rt_task(struct rq *rq) struct task_struct *task; /* * find lock_lowest_rq releases rq->lock - * so it is possible that next_task has changed. - * If it has, then try again. + * so it is possible that next_task has migrated. + * + * We need to make sure that the task is still on the same + * run-queue and is also still the next task eligible for + * pushing. */ task = pick_next_pushable_task(rq); - if (unlikely(task != next_task) && task && paranoid--) { - put_task_struct(next_task); - next_task = task; - goto retry; + if (task_cpu(next_task) == rq->cpu && task == next_task) { + /* + * If we get here, the task hasnt moved at all, but + * it has failed to push. We will not try again, + * since the other cpus will pull from us when they + * are ready. + */ + dequeue_pushable_task(rq, next_task); + goto out; } + if (!task) + /* No more tasks, just exit */ + goto out; + /* - * Once we have failed to push this task, we will not - * try again, since the other cpus will pull from us - * when they are ready + * Something has shifted, try again. */ - dequeue_pushable_task(rq, next_task); - goto out; + put_task_struct(next_task); + next_task = task; + goto retry; } deactivate_task(rq, next_task, 0); -- cgit v1.1 From 5762ba1873b0bb9faa631aaa02f533c2b9837f82 Mon Sep 17 00:00:00 2001 From: Sebastien Dugue Date: Mon, 1 Dec 2008 14:09:07 +0100 Subject: hrtimers: allow the hot-unplugging of all cpus Impact: fix CPU hotplug hang on Power6 testbox On architectures that support offlining all cpus (at least powerpc/pseries), hot-unpluging the tick_do_timer_cpu can result in a system hang. This comes from the fact that if the cpu going down happens to be the cpu doing the tick, then as the tick_do_timer_cpu handover happens after the cpu is dead (via the CPU_DEAD notification), we're left without ticks, jiffies are frozen and any task relying on timers (msleep, ...) is stuck. That's particularly the case for the cpu looping in __cpu_die() waiting for the dying cpu to be dead. This patch addresses this by having the tick_do_timer_cpu handover happen earlier during the CPU_DYING notification. For this, a new clockevent notification type is introduced (CLOCK_EVT_NOTIFY_CPU_DYING) which is triggered in hrtimer_cpu_notify(). Signed-off-by: Sebastien Dugue Cc: Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 4 ++++ kernel/time/tick-common.c | 26 +++++++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c2a69b8..61cb933 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1609,6 +1609,10 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, break; #ifdef CONFIG_HOTPLUG_CPU + case CPU_DYING: + case CPU_DYING_FROZEN: + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); + break; case CPU_DEAD: case CPU_DEAD_FROZEN: { diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index df12434..457d281 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -274,6 +274,21 @@ out_bc: } /* + * Transfer the do_timer job away from a dying cpu. + * + * Called with interrupts disabled. + */ +static void tick_handover_do_timer(int *cpup) +{ + if (*cpup == tick_do_timer_cpu) { + int cpu = first_cpu(cpu_online_map); + + tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : + TICK_DO_TIMER_NONE; + } +} + +/* * Shutdown an event device on a given cpu: * * This is called on a life CPU, when a CPU is dead. So we cannot @@ -297,13 +312,6 @@ static void tick_shutdown(unsigned int *cpup) clockevents_exchange_device(dev, NULL); td->evtdev = NULL; } - /* Transfer the do_timer job away from this cpu */ - if (*cpup == tick_do_timer_cpu) { - int cpu = first_cpu(cpu_online_map); - - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : - TICK_DO_TIMER_NONE; - } spin_unlock_irqrestore(&tick_device_lock, flags); } @@ -357,6 +365,10 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, tick_broadcast_oneshot_control(reason); break; + case CLOCK_EVT_NOTIFY_CPU_DYING: + tick_handover_do_timer(dev); + break; + case CLOCK_EVT_NOTIFY_CPU_DEAD: tick_shutdown_broadcast_oneshot(dev); tick_shutdown_broadcast(dev); -- cgit v1.1 From d7e51e66899f95dabc89b4d4c6674a6e50fa37fc Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 7 Jan 2009 15:03:13 -0800 Subject: sparseirq: make some func to be used with genirq Impact: clean up sparseirq fallout on random.c Ingo suggested to change some ifdef from SPARSE_IRQ to GENERIC_HARDIRQS so we could some #ifdef later if all arch support genirq Signed-off-by: Yinghai Lu Acked-by: Matt Mackall Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c20db0b..48299a8 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -213,6 +213,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; +static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; int __init early_irq_init(void) { struct irq_desc *desc; @@ -222,8 +223,10 @@ int __init early_irq_init(void) desc = irq_desc; count = ARRAY_SIZE(irq_desc); - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { desc[i].irq = i; + desc[i].kstat_irqs = kstat_irqs_all[i]; + } return arch_early_irq_init(); } @@ -451,12 +454,10 @@ void early_init_irq_lock_class(void) } } -#ifdef CONFIG_SPARSE_IRQ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); return desc ? desc->kstat_irqs[cpu] : 0; } -#endif EXPORT_SYMBOL(kstat_irqs_cpu); -- cgit v1.1 From 7f7ace0cda64c99599c23785f8979a072e118058 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 21:58:08 -0800 Subject: cpumask: update irq_desc to use cpumask_var_t Impact: reduce memory usage, use new cpumask API. Replace the affinity and pending_masks with cpumask_var_t's. This adds to the significant size reduction done with the SPARSE_IRQS changes. The added functions (init_alloc_desc_masks & init_copy_desc_masks) are in the include file so they can be inlined (and optimized out for the !CONFIG_CPUMASKS_OFFSTACK case.) [Naming chosen to be consistent with the other init*irq functions, as well as the backwards arg declaration of "from, to" instead of the more common "to, from" standard.] Includes a slight change to the declaration of struct irq_desc to embed the pending_mask within ifdef(CONFIG_SMP) to be consistent with other references, and some small changes to Xen. Tested: sparse/non-sparse/cpumask_offstack/non-cpumask_offstack/nonuma/nosmp on x86_64 Signed-off-by: Mike Travis Cc: Chris Wright Cc: Jeremy Fitzhardinge Cc: KOSAKI Motohiro Cc: Venkatesh Pallipadi Cc: virtualization@lists.osdl.org Cc: xen-devel@lists.xensource.com Cc: Yinghai Lu --- kernel/irq/chip.c | 5 ++++- kernel/irq/handle.c | 26 ++++++++++++++------------ kernel/irq/manage.c | 12 ++++++------ kernel/irq/migration.c | 12 ++++++------ kernel/irq/numa_migrate.c | 12 +++++++++++- kernel/irq/proc.c | 4 ++-- 6 files changed, 43 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f63c706..c248eba 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq) desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - cpumask_setall(&desc->affinity); + cpumask_setall(desc->affinity); +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_clear(desc->pending_mask); +#endif #endif spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c20db0b..b8fa135 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -64,9 +64,6 @@ static struct irq_desc irq_desc_init = { .handle_irq = handle_bad_irq, .depth = 1, .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif }; void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) @@ -88,6 +85,8 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) { + int node = cpu_to_node(cpu); + memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); spin_lock_init(&desc->lock); @@ -101,6 +100,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } + if (!init_alloc_desc_masks(desc, node, false)) { + printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); + BUG_ON(1); + } arch_init_chip_data(desc, cpu); } @@ -119,9 +122,6 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm .handle_irq = handle_bad_irq, .depth = 1, .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif } }; @@ -141,7 +141,7 @@ int __init early_irq_init(void) desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy[i]; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - + init_alloc_desc_masks(&desc[i], 0, true); irq_desc_ptrs[i] = desc + i; } @@ -188,6 +188,10 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) printk(KERN_ERR "can not alloc irq_desc\n"); BUG_ON(1); } + if (!init_alloc_desc_masks(desc, node, false)) { + printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); + BUG_ON(1); + } init_one_irq_desc(irq, desc, cpu); irq_desc_ptrs[irq] = desc; @@ -207,9 +211,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { .handle_irq = handle_bad_irq, .depth = 1, .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif } }; @@ -222,9 +223,10 @@ int __init early_irq_init(void) desc = irq_desc; count = ARRAY_SIZE(irq_desc); - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { desc[i].irq = i; - + init_alloc_desc_masks(&desc[i], 0, true); + } return arch_early_irq_init(); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index cd0cd8d..b98739a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -98,14 +98,14 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - cpumask_copy(&desc->affinity, cpumask); + cpumask_copy(desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); } else { desc->status |= IRQ_MOVE_PENDING; - cpumask_copy(&desc->pending_mask, cpumask); + cpumask_copy(desc->pending_mask, cpumask); } #else - cpumask_copy(&desc->affinity, cpumask); + cpumask_copy(desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif desc->status |= IRQ_AFFINITY_SET; @@ -127,16 +127,16 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) * one of the targets is online. */ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpumask_any_and(&desc->affinity, cpu_online_mask) + if (cpumask_any_and(desc->affinity, cpu_online_mask) < nr_cpu_ids) goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity); + cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); set_affinity: - desc->chip->set_affinity(irq, &desc->affinity); + desc->chip->set_affinity(irq, desc->affinity); return 0; } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index bd72329..e05ad9b 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -18,7 +18,7 @@ void move_masked_irq(int irq) desc->status &= ~IRQ_MOVE_PENDING; - if (unlikely(cpumask_empty(&desc->pending_mask))) + if (unlikely(cpumask_empty(desc->pending_mask))) return; if (!desc->chip->set_affinity) @@ -38,13 +38,13 @@ void move_masked_irq(int irq) * For correct operation this depends on the caller * masking the irqs. */ - if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask) + if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)) { - cpumask_and(&desc->affinity, - &desc->pending_mask, cpu_online_mask); - desc->chip->set_affinity(irq, &desc->affinity); + cpumask_and(desc->affinity, + desc->pending_mask, cpu_online_mask); + desc->chip->set_affinity(irq, desc->affinity); } - cpumask_clear(&desc->pending_mask); + cpumask_clear(desc->pending_mask); } void move_native_irq(int irq) diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index ecf765c..f001a4e 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -46,6 +46,7 @@ static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, desc->cpu = cpu; lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); + init_copy_desc_masks(old_desc, desc); arch_init_copy_chip_data(old_desc, desc, cpu); } @@ -76,11 +77,20 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); if (!desc) { - printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq); + printk(KERN_ERR "irq %d: can not get new irq_desc " + "for migration.\n", irq); /* still use old one */ desc = old_desc; goto out_unlock; } + if (!init_alloc_desc_masks(desc, node, false)) { + printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " + "for migration.\n", irq); + /* still use old one */ + kfree(desc); + desc = old_desc; + goto out_unlock; + } init_copy_one_irq_desc(irq, old_desc, desc, cpu); irq_desc_ptrs[irq] = desc; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index aae3f74..692363d 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = &desc->affinity; + const struct cpumask *mask = desc->affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) - mask = &desc->pending_mask; + mask = desc->pending_mask; #endif seq_cpumask(m, mask); seq_putc(m, '\n'); -- cgit v1.1 From 802bf931f2688ad125b73db597ce63cc842fb27a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 21:58:09 -0800 Subject: cpumask: fix bug in use cpumask_var_t in irq_desc Impact: fix bug where new irq_desc uses old cpumask pointers which are freed. As Yinghai pointed out, init_copy_one_irq_desc() copies the old desc to the new desc overwriting the cpumask pointers. Since the old_desc and the cpumask pointers are freed, then memory corruption will occur if these old pointers are used. Move the allocation of these pointers to after the copy. Signed-off-by: Mike Travis Cc: Yinghai Lu --- kernel/irq/handle.c | 8 +------- kernel/irq/numa_migrate.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index b8fa135..f01c0a3 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -85,8 +85,6 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) { - int node = cpu_to_node(cpu); - memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); spin_lock_init(&desc->lock); @@ -100,7 +98,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } - if (!init_alloc_desc_masks(desc, node, false)) { + if (!init_alloc_desc_masks(desc, cpu, false)) { printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); BUG_ON(1); } @@ -188,10 +186,6 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) printk(KERN_ERR "can not alloc irq_desc\n"); BUG_ON(1); } - if (!init_alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); - BUG_ON(1); - } init_one_irq_desc(irq, desc, cpu); irq_desc_ptrs[irq] = desc; diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index f001a4e..666260e 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -38,16 +38,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) old_desc->kstat_irqs = NULL; } -static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, +static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, struct irq_desc *desc, int cpu) { memcpy(desc, old_desc, sizeof(struct irq_desc)); + if (!init_alloc_desc_masks(desc, cpu, false)) { + printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " + "for migration.\n", irq); + return false; + } spin_lock_init(&desc->lock); desc->cpu = cpu; lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); init_copy_desc_masks(old_desc, desc); arch_init_copy_chip_data(old_desc, desc, cpu); + return true; } static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) @@ -83,15 +89,12 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, desc = old_desc; goto out_unlock; } - if (!init_alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " - "for migration.\n", irq); + if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { /* still use old one */ kfree(desc); desc = old_desc; goto out_unlock; } - init_copy_one_irq_desc(irq, old_desc, desc, cpu); irq_desc_ptrs[irq] = desc; -- cgit v1.1 From d38b223c86db3162dc85b5a1997ac8a210e1660b Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 21:58:11 -0800 Subject: cpumask: reduce stack usage in find_lowest_rq Impact: reduce stack usage, cleanup Use a cpumask_var_t in find_lowest_rq() and clean up other old cpumask_t calls. Signed-off-by: Mike Travis --- kernel/sched_rt.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 954e1a8..da932f4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -960,16 +960,17 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); -static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) +static inline int pick_optimal_cpu(int this_cpu, + const struct cpumask *mask) { int first; /* "this_cpu" is cheaper to preempt than a remote processor */ - if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) + if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask)) return this_cpu; - first = first_cpu(*mask); - if (first != NR_CPUS) + first = cpumask_first(mask); + if (first < nr_cpu_ids) return first; return -1; @@ -981,6 +982,7 @@ static int find_lowest_rq(struct task_struct *task) struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); + cpumask_var_t domain_mask; if (task->rt.nr_cpus_allowed == 1) return -1; /* No other targets possible */ @@ -1013,19 +1015,25 @@ static int find_lowest_rq(struct task_struct *task) if (this_cpu == cpu) this_cpu = -1; /* Skip this_cpu opt if the same */ - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_AFFINE) { - cpumask_t domain_mask; - int best_cpu; + if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) { + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; - cpumask_and(&domain_mask, sched_domain_span(sd), - lowest_mask); + cpumask_and(domain_mask, + sched_domain_span(sd), + lowest_mask); - best_cpu = pick_optimal_cpu(this_cpu, - &domain_mask); - if (best_cpu != -1) - return best_cpu; + best_cpu = pick_optimal_cpu(this_cpu, + domain_mask); + + if (best_cpu != -1) { + free_cpumask_var(domain_mask); + return best_cpu; + } + } } + free_cpumask_var(domain_mask); } /* -- cgit v1.1 From 9594949b060efe86ecaa1a66839232a3b9800bc9 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:06 -0800 Subject: irq: change references from NR_IRQS to nr_irqs Impact: preparation, cleanup, add KERN_INFO printk Modify references from NR_IRQS to nr_irqs as the later will become variable-sized based on nr_cpu_ids when CONFIG_SPARSE_IRQS=y. Signed-off-by: Mike Travis --- kernel/irq/handle.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index f01c0a3..790c5fa 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,6 +132,8 @@ int __init early_irq_init(void) int legacy_count; int i; + printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); + desc = irq_desc_legacy; legacy_count = ARRAY_SIZE(irq_desc_legacy); @@ -143,7 +145,7 @@ int __init early_irq_init(void) irq_desc_ptrs[i] = desc + i; } - for (i = legacy_count; i < NR_IRQS; i++) + for (i = legacy_count; i < nr_irqs; i++) irq_desc_ptrs[i] = NULL; return arch_early_irq_init(); @@ -151,7 +153,7 @@ int __init early_irq_init(void) struct irq_desc *irq_to_desc(unsigned int irq) { - return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL; + return (irq < nr_irqs) ? irq_desc_ptrs[irq] : NULL; } struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) @@ -160,9 +162,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) unsigned long flags; int node; - if (irq >= NR_IRQS) { - printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n", - irq, NR_IRQS); + if (irq >= nr_irqs) { + printk(KERN_WARNING "irq >= nr_irqs in irq_to_desc_alloc: %d %d\n", + irq, nr_irqs); WARN_ON(1); return NULL; } @@ -214,6 +216,8 @@ int __init early_irq_init(void) int count; int i; + printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); + desc = irq_desc; count = ARRAY_SIZE(irq_desc); -- cgit v1.1 From e2f4d06545ec1f29b0e838ee34cbf3500ea5b9a4 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:06 -0800 Subject: irq: use WARN() instead of WARN_ON(). Impact: cleanup WARN msg. Ingo requested: > While at it, could you please also convert this to a WARN() construct > instead? (in a separate commit) ... and it shall be done. ;-) Signed-off-by: Mike Travis --- kernel/irq/handle.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 790c5fa..fd1ef16 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -163,9 +163,8 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) int node; if (irq >= nr_irqs) { - printk(KERN_WARNING "irq >= nr_irqs in irq_to_desc_alloc: %d %d\n", - irq, nr_irqs); - WARN_ON(1); + WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", + irq, nr_irqs); return NULL; } -- cgit v1.1 From 0fa0ebbf15addc1be8f73325d809c8547a9de304 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:06 -0800 Subject: irq: allocate irq_desc_ptrs array based on nr_irqs Impact: allocate irq_desc_ptrs in preparation for making it variable-sized. This addresses this memory usage bump when NR_CPUS bumped from 128 to 4096: 34816 +229376 264192 +658% irq_desc_ptrs(.data.read_mostly) The patch is split into two parts, the first simply allocates the irq_desc_ptrs array. Then next will deal with making it variable. This is only when CONFIG_SPARSE_IRQS=y. Signed-off-by: Mike Travis --- kernel/irq/handle.c | 11 +++++++++-- kernel/irq/internals.h | 7 +++++++ 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index fd1ef16..d0b8f7e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "internals.h" @@ -110,7 +111,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) */ DEFINE_SPINLOCK(sparse_irq_lock); -struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; +struct irq_desc **irq_desc_ptrs __read_mostly; static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { [0 ... NR_IRQS_LEGACY-1] = { @@ -137,6 +138,9 @@ int __init early_irq_init(void) desc = irq_desc_legacy; legacy_count = ARRAY_SIZE(irq_desc_legacy); + /* allocate irq_desc_ptrs array based on nr_irqs */ + irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); + for (i = 0; i < legacy_count; i++) { desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy[i]; @@ -153,7 +157,10 @@ int __init early_irq_init(void) struct irq_desc *irq_to_desc(unsigned int irq) { - return (irq < nr_irqs) ? irq_desc_ptrs[irq] : NULL; + if (irq_desc_ptrs && irq < nr_irqs) + return irq_desc_ptrs[irq]; + + return NULL; } struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e6d0a43..40416a8 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -16,7 +16,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); extern spinlock_t sparse_irq_lock; + +#ifdef CONFIG_SPARSE_IRQ +/* irq_desc_ptrs allocated at boot time */ +extern struct irq_desc **irq_desc_ptrs; +#else +/* irq_desc_ptrs is a fixed size array */ extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; +#endif #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); -- cgit v1.1 From 9332fccdedf8e09448f3b69b624211ae879f6c45 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:07 -0800 Subject: irq: initialize nr_irqs based on nr_cpu_ids Impact: Reduce memory usage. This is the second half of the changes to make the irq_desc_ptrs be variable sized based on nr_cpu_ids. This is done by adding a new "max_nr_irqs" macro to irq_vectors.h (and a dummy in irqnr.h) to return a max NR_IRQS value based on NR_CPUS or nr_cpu_ids. This necessitated moving the define of MAX_IO_APICS to a separate file (asm/apicnum.h) so it could be included without the baggage of the other asm/apicdef.h declarations. Signed-off-by: Mike Travis --- kernel/irq/handle.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d0b8f7e..ebba7a1 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -133,6 +133,9 @@ int __init early_irq_init(void) int legacy_count; int i; + /* initialize nr_irqs based on nr_cpu_ids */ + nr_irqs = max_nr_irqs(nr_cpu_ids); + printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); desc = irq_desc_legacy; -- cgit v1.1 From 542d865bbed4ce1f050f586e53cf1cfadda93766 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:07 -0800 Subject: kstat: modify kstat_irqs_legacy to be variable sized Impact: reduce memory usage. Allocate kstat_irqs_legacy based on nr_cpu_ids to deal with this memory usage bump when NR_CPUS bumped from 128 to 4096: 8192 +253952 262144 +3100% kstat_irqs_legacy(.bss) This is only when CONFIG_SPARSE_IRQS=y. Signed-off-by: Mike Travis --- kernel/irq/handle.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index ebba7a1..b39f32a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -124,8 +124,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm } }; -/* FIXME: use bootmem alloc ...*/ -static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS]; +static unsigned int *kstat_irqs_legacy; int __init early_irq_init(void) { @@ -144,9 +143,14 @@ int __init early_irq_init(void) /* allocate irq_desc_ptrs array based on nr_irqs */ irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); + /* allocate based on nr_cpu_ids */ + /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ + kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * + sizeof(int)); + for (i = 0; i < legacy_count; i++) { desc[i].irq = i; - desc[i].kstat_irqs = kstat_irqs_legacy[i]; + desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); init_alloc_desc_masks(&desc[i], 0, true); irq_desc_ptrs[i] = desc + i; -- cgit v1.1 From 92296c6d6e908c35fca287a21af27be814af9c75 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sun, 11 Jan 2009 09:22:58 -0800 Subject: cpumask, irq: non-x86 build failures Ingo Molnar wrote: > All non-x86 architectures fail to build: > > In file included from /home/mingo/tip/include/linux/random.h:11, > from /home/mingo/tip/include/linux/stackprotector.h:6, > from /home/mingo/tip/init/main.c:17: > /home/mingo/tip/include/linux/irqnr.h:26:63: error: asm/irq_vectors.h: No such file or directory Do not include asm/irq_vectors.h in generic code - it's not available on all architectures. Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index b39f32a..04d3e46 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -58,6 +58,11 @@ int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); #ifdef CONFIG_SPARSE_IRQ + +#ifndef max_nr_irqs +#define max_nr_irqs(nr_cpus) NR_IRQS +#endif + static struct irq_desc irq_desc_init = { .irq = -1, .status = IRQ_DISABLED, -- cgit v1.1 From 4a046d1754ee6ebb6f399696805ed61ea0444d4c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 12 Jan 2009 17:39:24 -0800 Subject: x86: arch_probe_nr_irqs Impact: save RAM with large NR_CPUS, get smaller nr_irqs Signed-off-by: Yinghai Lu Signed-off-by: Mike Travis --- kernel/irq/handle.c | 9 ++------- kernel/softirq.c | 5 +++++ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 04d3e46..375d68c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -59,10 +59,6 @@ EXPORT_SYMBOL_GPL(nr_irqs); #ifdef CONFIG_SPARSE_IRQ -#ifndef max_nr_irqs -#define max_nr_irqs(nr_cpus) NR_IRQS -#endif - static struct irq_desc irq_desc_init = { .irq = -1, .status = IRQ_DISABLED, @@ -137,9 +133,8 @@ int __init early_irq_init(void) int legacy_count; int i; - /* initialize nr_irqs based on nr_cpu_ids */ - nr_irqs = max_nr_irqs(nr_cpu_ids); - + /* initialize nr_irqs based on nr_cpu_ids */ + arch_probe_nr_irqs(); printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); desc = irq_desc_legacy; diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de..0365b48 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -795,6 +795,11 @@ int __init __weak early_irq_init(void) return 0; } +int __init __weak arch_probe_nr_irqs(void) +{ + return 0; +} + int __init __weak arch_early_irq_init(void) { return 0; -- cgit v1.1 From b07430ac37103218b5c1e542490a1b98e6deb3d6 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 14 Jan 2009 08:55:39 -0500 Subject: sched: de CPP-ify the scheduler code Signed-off-by: Gregory Haskins --- kernel/sched_rt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 18c7b5b..4eda5f7 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -64,8 +64,10 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) #else -#define enqueue_pushable_task(rq, p) do { } while (0) -#define dequeue_pushable_task(rq, p) do { } while (0) +static inline +void enqueue_pushable_task(struct rq *rq, struct task_struct *p) {} +static inline +void dequeue_pushable_task(struct rq *rq, struct task_struct *p) {} #endif /* CONFIG_SMP */ -- cgit v1.1 From 398a153b16b09a68739928d4502455db9725ac86 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 14 Jan 2009 09:10:04 -0500 Subject: sched: fix build error in kernel/sched_rt.c when RT_GROUP_SCHED && !SMP Ingo found a build error in the scheduler when RT_GROUP_SCHED was enabled, but SMP was not. This patch rearranges the code such that it is a little more streamlined and compiles under all permutations of SMP, UP and RT_GROUP_SCHED. It was boot tested on my 4-way x86_64 and it still passes preempt-test. Signed-off-by: Gregory Haskins --- kernel/sched.c | 4 + kernel/sched_rt.c | 264 +++++++++++++++++++++++++++++++++++------------------- 2 files changed, 174 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dd1a146..2b703f1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -466,7 +466,9 @@ struct rt_rq { #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP int next; /* next highest */ +#endif } highest_prio; #endif #ifdef CONFIG_SMP @@ -8267,8 +8269,10 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED rt_rq->highest_prio.curr = MAX_RT_PRIO; +#ifdef CONFIG_SMP rt_rq->highest_prio.next = MAX_RT_PRIO; #endif +#endif #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 4eda5f7..4230b15 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -3,6 +3,40 @@ * policies) */ +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +{ + return container_of(rt_se, struct task_struct, rt); +} + +#ifdef CONFIG_RT_GROUP_SCHED + +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +{ + return rt_rq->rq; +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + return rt_se->rt_rq; +} + +#else /* CONFIG_RT_GROUP_SCHED */ + +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +{ + return container_of(rt_rq, struct rq, rt); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + struct task_struct *p = rt_task_of(rt_se); + struct rq *rq = task_rq(p); + + return &rq->rt; +} + +#endif /* CONFIG_RT_GROUP_SCHED */ + #ifdef CONFIG_SMP static inline int rt_overloaded(struct rq *rq) @@ -37,19 +71,35 @@ static inline void rt_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } -static void update_rt_migration(struct rq *rq) +static void update_rt_migration(struct rt_rq *rt_rq) { - if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { - if (!rq->rt.overloaded) { - rt_set_overload(rq); - rq->rt.overloaded = 1; + if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { + if (!rt_rq->overloaded) { + rt_set_overload(rq_of_rt_rq(rt_rq)); + rt_rq->overloaded = 1; } - } else if (rq->rt.overloaded) { - rt_clear_overload(rq); - rq->rt.overloaded = 0; + } else if (rt_rq->overloaded) { + rt_clear_overload(rq_of_rt_rq(rt_rq)); + rt_rq->overloaded = 0; } } +static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + if (rt_se->nr_cpus_allowed > 1) + rt_rq->rt_nr_migratory++; + + update_rt_migration(rt_rq); +} + +static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + if (rt_se->nr_cpus_allowed > 1) + rt_rq->rt_nr_migratory--; + + update_rt_migration(rt_rq); +} + static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) { plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); @@ -68,14 +118,13 @@ static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) {} static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) {} +static inline +void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} +static inline +void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} #endif /* CONFIG_SMP */ -static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) -{ - return container_of(rt_se, struct task_struct, rt); -} - static inline int on_rt_rq(struct sched_rt_entity *rt_se) { return !list_empty(&rt_se->run_list); @@ -99,16 +148,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) #define for_each_leaf_rt_rq(rt_rq, rq) \ list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) -{ - return rt_rq->rq; -} - -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) -{ - return rt_se->rt_rq; -} - #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = rt_se->parent) @@ -196,19 +235,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) #define for_each_leaf_rt_rq(rt_rq, rq) \ for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) -{ - return container_of(rt_rq, struct rq, rt); -} - -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) -{ - struct task_struct *p = rt_task_of(rt_se); - struct rq *rq = task_rq(p); - - return &rq->rt; -} - #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = NULL) @@ -567,7 +593,7 @@ static void update_curr_rt(struct rq *rq) } } -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +#if defined CONFIG_SMP static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); @@ -580,33 +606,24 @@ static inline int next_prio(struct rq *rq) else return MAX_RT_PRIO; } -#endif -static inline -void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +static void +inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { - int prio = rt_se_prio(rt_se); -#ifdef CONFIG_SMP struct rq *rq = rq_of_rt_rq(rt_rq); -#endif - WARN_ON(!rt_prio(prio)); - rt_rq->rt_nr_running++; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - if (prio < rt_rq->highest_prio.curr) { + if (prio < prev_prio) { /* * If the new task is higher in priority than anything on the - * run-queue, we have a new high that must be published to - * the world. We also know that the previous high becomes - * our next-highest. + * run-queue, we know that the previous high becomes our + * next-highest. */ - rt_rq->highest_prio.next = rt_rq->highest_prio.curr; - rt_rq->highest_prio.curr = prio; -#ifdef CONFIG_SMP + rt_rq->highest_prio.next = prev_prio; + if (rq->online) cpupri_set(&rq->rd->cpupri, rq->cpu, prio); -#endif + } else if (prio == rt_rq->highest_prio.curr) /* * If the next task is equal in priority to the highest on @@ -619,72 +636,131 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) * Otherwise, we need to recompute next-highest */ rt_rq->highest_prio.next = next_prio(rq); -#endif -#ifdef CONFIG_SMP - if (rt_se->nr_cpus_allowed > 1) - rq->rt.rt_nr_migratory++; +} - update_rt_migration(rq); -#endif -#ifdef CONFIG_RT_GROUP_SCHED - if (rt_se_boosted(rt_se)) - rt_rq->rt_nr_boosted++; +static void +dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); - if (rt_rq->tg) - start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); -#else - start_rt_bandwidth(&def_rt_bandwidth); -#endif + if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next)) + rt_rq->highest_prio.next = next_prio(rq); + + if (rq->online && rt_rq->highest_prio.curr != prev_prio) + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } +#else /* CONFIG_SMP */ + static inline -void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -#ifdef CONFIG_SMP - struct rq *rq = rq_of_rt_rq(rt_rq); - int highest_prio = rt_rq->highest_prio.curr; -#endif +void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} +static inline +void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} + +#endif /* CONFIG_SMP */ - WARN_ON(!rt_prio(rt_se_prio(rt_se))); - WARN_ON(!rt_rq->rt_nr_running); - rt_rq->rt_nr_running--; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +static void +inc_rt_prio(struct rt_rq *rt_rq, int prio) +{ + int prev_prio = rt_rq->highest_prio.curr; + + if (prio < prev_prio) + rt_rq->highest_prio.curr = prio; + + inc_rt_prio_smp(rt_rq, prio, prev_prio); +} + +static void +dec_rt_prio(struct rt_rq *rt_rq, int prio) +{ + int prev_prio = rt_rq->highest_prio.curr; + if (rt_rq->rt_nr_running) { - int prio = rt_se_prio(rt_se); - WARN_ON(prio < rt_rq->highest_prio.curr); + WARN_ON(prio < prev_prio); /* - * This may have been our highest or next-highest priority - * task and therefore we may have some recomputation to do + * This may have been our highest task, and therefore + * we may have some recomputation to do */ - if (prio == rt_rq->highest_prio.curr) { + if (prio == prev_prio) { struct rt_prio_array *array = &rt_rq->active; rt_rq->highest_prio.curr = sched_find_first_bit(array->bitmap); } - if (prio <= rt_rq->highest_prio.next) - rt_rq->highest_prio.next = next_prio(rq); } else rt_rq->highest_prio.curr = MAX_RT_PRIO; -#endif -#ifdef CONFIG_SMP - if (rt_se->nr_cpus_allowed > 1) - rq->rt.rt_nr_migratory--; - if (rq->online && rt_rq->highest_prio.curr != highest_prio) - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); + dec_rt_prio_smp(rt_rq, prio, prev_prio); +} + +#else + +static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} +static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} + +#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ - update_rt_migration(rq); -#endif /* CONFIG_SMP */ #ifdef CONFIG_RT_GROUP_SCHED + +static void +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted++; + + if (rt_rq->tg) + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); +} + +static void +dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted--; WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); -#endif +} + +#else /* CONFIG_RT_GROUP_SCHED */ + +static void +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + start_rt_bandwidth(&def_rt_bandwidth); +} + +static inline +void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} + +#endif /* CONFIG_RT_GROUP_SCHED */ + +static inline +void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + int prio = rt_se_prio(rt_se); + + WARN_ON(!rt_prio(prio)); + rt_rq->rt_nr_running++; + + inc_rt_prio(rt_rq, prio); + inc_rt_migration(rt_se, rt_rq); + inc_rt_group(rt_se, rt_rq); +} + +static inline +void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + WARN_ON(!rt_prio(rt_se_prio(rt_se))); + WARN_ON(!rt_rq->rt_nr_running); + rt_rq->rt_nr_running--; + + dec_rt_prio(rt_rq, rt_se_prio(rt_se)); + dec_rt_migration(rt_se, rt_rq); + dec_rt_group(rt_se, rt_rq); } static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) @@ -1453,7 +1529,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, rq->rt.rt_nr_migratory--; } - update_rt_migration(rq); + update_rt_migration(&rq->rt); } cpumask_copy(&p->cpus_allowed, new_mask); -- cgit v1.1 From 831451ac4e44d3a20b581ce726ef1d1144373f7d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2009 12:39:18 +0100 Subject: sched: introduce avg_wakeup Introduce a new avg_wakeup statistic. avg_wakeup is a measure of how frequently a task wakes up other tasks, it represents the average time between wakeups, with a limit of avg_runtime for when it doesn't wake up anybody. Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 36 ++++++++++++++++++++++++++++++------ kernel/sched_debug.c | 1 + 2 files changed, 31 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13..86f5a06 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1705,6 +1705,9 @@ static void update_avg(u64 *avg, u64 sample) static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { + if (wakeup) + p->se.start_runtime = p->se.sum_exec_runtime; + sched_info_queued(p); p->sched_class->enqueue_task(rq, p, wakeup); p->se.on_rq = 1; @@ -1712,10 +1715,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { - if (sleep && p->se.last_wakeup) { - update_avg(&p->se.avg_overlap, - p->se.sum_exec_runtime - p->se.last_wakeup); - p->se.last_wakeup = 0; + if (sleep) { + if (p->se.last_wakeup) { + update_avg(&p->se.avg_overlap, + p->se.sum_exec_runtime - p->se.last_wakeup); + p->se.last_wakeup = 0; + } else { + update_avg(&p->se.avg_wakeup, + sysctl_sched_wakeup_granularity); + } } sched_info_dequeued(p); @@ -2345,6 +2353,22 @@ out_activate: activate_task(rq, p, 1); success = 1; + /* + * Only attribute actual wakeups done by this task. + */ + if (!in_interrupt()) { + struct sched_entity *se = ¤t->se; + u64 sample = se->sum_exec_runtime; + + if (se->last_wakeup) + sample -= se->last_wakeup; + else + sample -= se->start_runtime; + update_avg(&se->avg_wakeup, sample); + + se->last_wakeup = se->sum_exec_runtime; + } + out_running: trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); @@ -2355,8 +2379,6 @@ out_running: p->sched_class->task_wake_up(rq, p); #endif out: - current->se.last_wakeup = current->se.sum_exec_runtime; - task_rq_unlock(rq, &flags); return success; @@ -2386,6 +2408,8 @@ static void __sched_fork(struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; + p->se.start_runtime = 0; + p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 16eeba4e..2b1260f 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -397,6 +397,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.vruntime); PN(se.sum_exec_runtime); PN(se.avg_overlap); + PN(se.avg_wakeup); nr_switches = p->nvcsw + p->nivcsw; -- cgit v1.1 From e52fb7c097238d34f4d8e2a596f8a3f85b0c0565 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2009 12:39:19 +0100 Subject: sched: prefer wakers Prefer tasks that wake other tasks to preempt quickly. This improves performance because more work is available sooner. The workload that prompted this patch was a kernel build over NFS4 (for some curious and not understood reason we had to revert commit: 18de9735300756e3ca9c361ef58409d8561dfe0d to make any progress at all) Without this patch a make -j8 bzImage (of x86-64 defconfig) would take 3m30-ish, with this patch we're down to 2m50-ish. psql-sysbench/mysql-sysbench show a slight improvement in peak performance as well, tbench and vmark seemed to not care. It is possible to improve upon the build time (to 2m20-ish) but that seriously destroys other benchmarks (just shows that there's more room for tinkering). Much thanks to Mike who put in a lot of effort to benchmark things and proved a worthy opponent with a competing patch. Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 59 ++++++++++++++++++++++++++++++++++++++++++++----- kernel/sched_features.h | 3 ++- 2 files changed, 55 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8e1352c..bdf6434 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1295,16 +1295,63 @@ out: } #endif /* CONFIG_SMP */ -static unsigned long wakeup_gran(struct sched_entity *se) +/* + * Adaptive granularity + * + * se->avg_wakeup gives the average time a task runs until it does a wakeup, + * with the limit of wakeup_gran -- when it never does a wakeup. + * + * So the smaller avg_wakeup is the faster we want this task to preempt, + * but we don't want to treat the preemptee unfairly and therefore allow it + * to run for at least the amount of time we'd like to run. + * + * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one + * + * NOTE: we use *nr_running to scale with load, this nicely matches the + * degrading latency on load. + */ +static unsigned long +adaptive_gran(struct sched_entity *curr, struct sched_entity *se) +{ + u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running; + u64 gran = 0; + + if (this_run < expected_wakeup) + gran = expected_wakeup - this_run; + + return min_t(s64, gran, sysctl_sched_wakeup_granularity); +} + +static unsigned long +wakeup_gran(struct sched_entity *curr, struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; + if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN)) + gran = adaptive_gran(curr, se); + /* - * More easily preempt - nice tasks, while not making it harder for - * + nice tasks. + * Since its curr running now, convert the gran from real-time + * to virtual-time in his units. */ - if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD) - gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); + if (sched_feat(ASYM_GRAN)) { + /* + * By using 'se' instead of 'curr' we penalize light tasks, so + * they get preempted easier. That is, if 'se' < 'curr' then + * the resulting gran will be larger, therefore penalizing the + * lighter, if otoh 'se' > 'curr' then the resulting gran will + * be smaller, again penalizing the lighter task. + * + * This is especially important for buddies when the leftmost + * task is higher priority than the buddy. + */ + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, se); + } else { + if (unlikely(curr->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, curr); + } return gran; } @@ -1331,7 +1378,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) if (vdiff <= 0) return -1; - gran = wakeup_gran(curr); + gran = wakeup_gran(curr, se); if (vdiff > gran) return 1; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index da5d93b..76f6175 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,5 +1,6 @@ SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) -SCHED_FEAT(NORMALIZED_SLEEPER, 1) +SCHED_FEAT(NORMALIZED_SLEEPER, 0) +SCHED_FEAT(ADAPTIVE_GRAN, 1) SCHED_FEAT(WAKEUP_PREEMPT, 1) SCHED_FEAT(START_DEBIT, 1) SCHED_FEAT(AFFINE_WAKEUPS, 1) -- cgit v1.1 From 2d68259db26ad57fd9643f1c69b5181ec9836ca9 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 16 Jan 2009 17:14:38 +0900 Subject: clockevents: let set_mode() setup delta information Allow the set_mode() clockevent callback to decide and fill in delta details such as shift, mult, max_delta_ns and min_delta_ns. With this change the clockevent can be registered without delta details which allows us to keep the parent clock disabled until the clockevent gets setup using set_mode(). Letting set_mode() fill in or update delta details allows us to save power by disabling the parent clock while the clockevent is unused. This may however make the parent clock rate change, so next time the clockevent gets enabled we need let set_mode() to update the detla details accordingly. Doing it at registration time is not enough. Furthermore, the delta details seem unused in the case of periodic-only clockevent drivers, so this change also allows registration of such drivers without the delta details filled in. Signed-off-by: Magnus Damm Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index ea2f48a..d13be21 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -68,6 +68,17 @@ void clockevents_set_mode(struct clock_event_device *dev, if (dev->mode != mode) { dev->set_mode(mode, dev); dev->mode = mode; + + /* + * A nsec2cyc multiplicator of 0 is invalid and we'd crash + * on it, so fix it up and emit a warning: + */ + if (mode == CLOCK_EVT_MODE_ONESHOT) { + if (unlikely(!dev->mult)) { + dev->mult = 1; + WARN_ON(1); + } + } } } @@ -168,15 +179,6 @@ void clockevents_register_device(struct clock_event_device *dev) BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); BUG_ON(!dev->cpumask); - /* - * A nsec2cyc multiplicator of 0 is invalid and we'd crash - * on it, so fix it up and emit a warning: - */ - if (unlikely(!dev->mult)) { - dev->mult = 1; - WARN_ON(1); - } - spin_lock(&clockevents_lock); list_add(&dev->list, &clockevent_devices); -- cgit v1.1 From ceacc2c1c85ac498ca4cf297bdfe5b4aaa9fd0e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 16 Jan 2009 14:46:40 +0100 Subject: sched: make plist a library facility Ingo Molnar wrote: > here's a new build failure with tip/sched/rt: > > LD .tmp_vmlinux1 > kernel/built-in.o: In function `set_curr_task_rt': > sched.c:(.text+0x3675): undefined reference to `plist_del' > kernel/built-in.o: In function `pick_next_task_rt': > sched.c:(.text+0x37ce): undefined reference to `plist_del' > kernel/built-in.o: In function `enqueue_pushable_task': > sched.c:(.text+0x381c): undefined reference to `plist_del' Eliminate the plist library kconfig and make it available unconditionally. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 4230b15..48d1f6e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -114,14 +114,23 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) #else +static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ +} + static inline -void enqueue_pushable_task(struct rq *rq, struct task_struct *p) {} -static inline -void dequeue_pushable_task(struct rq *rq, struct task_struct *p) {} -static inline -void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} +void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +} + static inline -void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} +void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +} #endif /* CONFIG_SMP */ -- cgit v1.1 From 74296a8ed6aa3c5bf672808ada690de7ba323ecc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 16 Jan 2009 17:43:50 +0100 Subject: irq: provide debug_poll_all_shared_irqs() method under CONFIG_DEBUG_SHIRQ Provide a shared interrupt debug facility under CONFIG_DEBUG_SHIRQ: it uses the existing irqpoll facilities to iterate through all registered interrupt handlers and call those which can handle shared IRQ lines. This can be handy for suspend/resume debugging: if we call this function early during resume we can trigger crashes in those drivers which have incorrect assumptions about when exactly their ISRs will be called during suspend/resume. Signed-off-by: Ingo Molnar --- kernel/irq/spurious.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dd364c1..4d56829 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -104,7 +104,7 @@ static int misrouted_irq(int irq) return ok; } -static void poll_spurious_irqs(unsigned long dummy) +static void poll_all_shared_irqs(void) { struct irq_desc *desc; int i; @@ -123,11 +123,23 @@ static void poll_spurious_irqs(unsigned long dummy) try_one_irq(i, desc); } +} + +static void poll_spurious_irqs(unsigned long dummy) +{ + poll_all_shared_irqs(); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } +#ifdef CONFIG_DEBUG_SHIRQ +void debug_poll_all_shared_irqs(void) +{ + poll_all_shared_irqs(); +} +#endif + /* * If 99,900 of the previous 100,000 interrupts have not been handled * then assume that the IRQ is stuck in some manner. Drop a diagnostic -- cgit v1.1 From 68564a46976017496c2227660930d81240f82355 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 16 Jan 2009 15:31:15 -0800 Subject: work_on_cpu: don't try to get_online_cpus() in work_on_cpu. Impact: remove potential circular lock dependency with cpu hotplug lock This has caused more problems than it solved, with a pile of cpu hotplug locking issues. Followup patches will get_online_cpus() in callers that need it, but if they don't do it they're no worse than before when they were using set_cpus_allowed without locking. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- kernel/workqueue.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2f44583..a35afdb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -991,8 +991,8 @@ static void do_work_for_cpu(struct work_struct *w) * @fn: the function to run * @arg: the function arg * - * This will return -EINVAL in the cpu is not online, or the return value - * of @fn otherwise. + * This will return the value @fn returns. + * It is up to the caller to ensure that the cpu doesn't go offline. */ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) { @@ -1001,14 +1001,8 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) INIT_WORK(&wfc.work, do_work_for_cpu); wfc.fn = fn; wfc.arg = arg; - get_online_cpus(); - if (unlikely(!cpu_online(cpu))) - wfc.ret = -EINVAL; - else { - schedule_work_on(cpu, &wfc.work); - flush_work(&wfc.work); - } - put_online_cpus(); + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); return wfc.ret; } -- cgit v1.1 From e1d9ec6246a2668a5d037f529877efb7cf176af8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 16 Jan 2009 15:31:15 -0800 Subject: work_on_cpu: Use our own workqueue. Impact: remove potential clashes with generic kevent workqueue Annoyingly, some places we want to use work_on_cpu are already in workqueues. As per Ingo's suggestion, we create a different workqueue for work_on_cpu. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- kernel/workqueue.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a35afdb..1f0c509 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -971,6 +971,8 @@ undo: } #ifdef CONFIG_SMP +static struct workqueue_struct *work_on_cpu_wq __read_mostly; + struct work_for_cpu { struct work_struct work; long (*fn)(void *); @@ -1001,7 +1003,7 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) INIT_WORK(&wfc.work, do_work_for_cpu); wfc.fn = fn; wfc.arg = arg; - schedule_work_on(cpu, &wfc.work); + queue_work_on(cpu, work_on_cpu_wq, &wfc.work); flush_work(&wfc.work); return wfc.ret; @@ -1019,4 +1021,8 @@ void __init init_workqueues(void) hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); +#ifdef CONFIG_SMP + work_on_cpu_wq = create_workqueue("work_on_cpu"); + BUG_ON(!work_on_cpu_wq); +#endif } -- cgit v1.1 From 6626bff24578753808c8b5bd4f1619e14e980f0f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Jan 2009 11:31:36 +0100 Subject: hrtimer: prevent negative expiry value after clock_was_set() Impact: prevent false positive WARN_ON() in clockevents_program_event() clock_was_set() changes the base->offset of CLOCK_REALTIME and enforces the reprogramming of the clockevent device to expire timers which are based on CLOCK_REALTIME. If the clock change is large enough then the subtraction of the timer expiry value and base->offset can become negative which triggers the warning in clockevents_program_event(). Check the subtraction result and set a negative value to 0. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 2c40ee8..d71cef2 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -501,6 +501,13 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) continue; timer = rb_entry(base->first, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + /* + * clock_was_set() has changed base->offset so the + * result might be negative. Fix it up to prevent a + * false positive in clockevents_program_event() + */ + if (expires.tv64 < 0) + expires.tv64 = 0; if (expires.tv64 < cpu_base->expires_next.tv64) cpu_base->expires_next = expires; } -- cgit v1.1 From eefef1cf7653cd4e0aaf743c00ae8345086cdc01 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 1 Feb 2009 01:04:33 -0800 Subject: net: add ARP notify option for devices This adds another inet device option to enable gratuitous ARP when device is brought up or address change. This is handy for clusters or virtualization. Signed-off-by: Stephen Hemminger Signed-off-by: Jeremy Fitzhardinge Signed-off-by: David S. Miller --- kernel/sysctl_check.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index fafeb48..b38423c 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -219,6 +219,7 @@ static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, + { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, {} }; -- cgit v1.1 From 0f3c2a89c1451cdf6328f99977bd9decd4f708e1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 8 Feb 2009 16:18:03 -0800 Subject: irq: clear kstat_irqs Impact: get correct kstat_irqs [/proc/interrupts] for msi/msi-x etc need to call clear_kstat_irqs(), so when we reuse that irq_desc, we get correct kstat in /proc/interrupts. This makes /proc/interrupts not have entries. Don't need to worry about arch that doesn't support genirq, because they will not call dynamic_irq_cleanup(). v2: simplify and make clear_kstat_irqs more robust Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 1 + kernel/irq/handle.c | 5 +++++ kernel/irq/internals.h | 1 + 3 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f63c706..1310856 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -78,6 +78,7 @@ void dynamic_irq_cleanup(unsigned int irq) desc->handle_irq = handle_bad_irq; desc->chip = &no_irq_chip; desc->name = NULL; + clear_kstat_irqs(desc); spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 48299a8..1b473e7 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -242,6 +242,11 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) } #endif /* !CONFIG_SPARSE_IRQ */ +void clear_kstat_irqs(struct irq_desc *desc) +{ + memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); +} + /* * What should we do if we get a hw irq event on an illegal vector? * Each architecture has to answer this themself. diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e6d0a43..b60950b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,6 +15,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); +extern void clear_kstat_irqs(struct irq_desc *desc); extern spinlock_t sparse_irq_lock; extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; -- cgit v1.1 From 005bf0e6fa0e9543933fe2e36322af649df7cacb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 8 Feb 2009 16:18:03 -0800 Subject: irq: optimize init_kstat_irqs/init_copy_kstat_irqs Simplify and make init_kstat_irqs etc more type proof, suggested by Andrew. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 20 +++++++++++--------- kernel/irq/numa_migrate.c | 11 +++-------- 2 files changed, 14 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 1b473e7..49d642b 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -71,19 +71,21 @@ static struct irq_desc irq_desc_init = { void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) { - unsigned long bytes; - char *ptr; int node; - - /* Compute how many bytes we need per irq and allocate them */ - bytes = nr * sizeof(unsigned int); + void *ptr; node = cpu_to_node(cpu); - ptr = kzalloc_node(bytes, GFP_ATOMIC, node); - printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node); + ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); - if (ptr) - desc->kstat_irqs = (unsigned int *)ptr; + /* + * don't overwite if can not get new one + * init_copy_kstat_irqs() could still use old one + */ + if (ptr) { + printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", + cpu, node); + desc->kstat_irqs = ptr; + } } static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index ecf765c..c500cfe 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -17,16 +17,11 @@ static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc, int cpu, int nr) { - unsigned long bytes; - init_kstat_irqs(desc, cpu, nr); - if (desc->kstat_irqs != old_desc->kstat_irqs) { - /* Compute how many bytes we need per irq and allocate them */ - bytes = nr * sizeof(unsigned int); - - memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes); - } + if (desc->kstat_irqs != old_desc->kstat_irqs) + memcpy(desc->kstat_irqs, old_desc->kstat_irqs, + nr * sizeof(*desc->kstat_irqs)); } static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) -- cgit v1.1 From 548c8933801c9ee347b6f1bad2491e4286a4f3a2 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sun, 8 Feb 2009 20:24:47 +0100 Subject: kernel/irq: fix sparse warning: make symbol static While being at it make every occurrence of 'do_irq_select_affinity' have the same signature in terms of signedness of the first argument. Fix this sparse warning: kernel/irq/manage.c:112:5: warning: symbol 'do_irq_select_affinity' was not declared. Should it be static? Also rename do_irq_select_affinity() to setup_affinity() - shorter name and clearer naming. Signed-off-by: Hannes Eder Acked-by: Matthew Wilcox Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 291f036..38008b8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -109,7 +109,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) /* * Generic version of the affinity autoselector. */ -int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) +static int setup_affinity(unsigned int irq, struct irq_desc *desc) { if (!irq_can_set_affinity(irq)) return 0; @@ -133,7 +133,7 @@ set_affinity: return 0; } #else -static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d) +static inline int setup_affinity(unsigned int irq, struct irq_desc *d) { return irq_select_affinity(irq); } @@ -149,14 +149,14 @@ int irq_select_affinity_usr(unsigned int irq) int ret; spin_lock_irqsave(&desc->lock, flags); - ret = do_irq_select_affinity(irq, desc); + ret = setup_affinity(irq, desc); spin_unlock_irqrestore(&desc->lock, flags); return ret; } #else -static inline int do_irq_select_affinity(int irq, struct irq_desc *desc) +static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) { return 0; } @@ -488,7 +488,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) desc->status |= IRQ_NO_BALANCING; /* Set default affinity mask once everything is setup */ - do_irq_select_affinity(irq, desc); + setup_affinity(irq, desc); } else if ((new->flags & IRQF_TRIGGER_MASK) && (new->flags & IRQF_TRIGGER_MASK) -- cgit v1.1 From 6cd61c0baa8bce32271226198b46c67a7a05d108 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Feb 2009 22:17:39 +0900 Subject: elf: add ELF_CORE_COPY_KERNEL_REGS() ELF core dump is used for both user land core dump and kernel crash dump. Depending on architecture, register might need to be accessed differently for userland and kernel. Allow architectures to define ELF_CORE_COPY_KERNEL_REGS() and use different operation for kernel register dump. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- kernel/kexec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 8a6d7b0..795e7b6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1130,7 +1130,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) return; memset(&prstatus, 0, sizeof(prstatus)); prstatus.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); + elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); final_note(buf); -- cgit v1.1 From 5d707e9c8ef2a3596ed5c975c6ff05cec890c2b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Feb 2009 22:17:39 +0900 Subject: stackprotector: update make rules Impact: no default -fno-stack-protector if stackp is enabled, cleanup Stackprotector make rules had the following problems. * cc support test and warning are scattered across makefile and kernel/panic.c. * -fno-stack-protector was always added regardless of configuration. Update such that cc support test and warning are contained in makefile and -fno-stack-protector is added iff stackp is turned off. While at it, prepare for 32bit support. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- kernel/panic.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 33cab3d..32fe4ef 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -359,10 +359,6 @@ EXPORT_SYMBOL(warn_slowpath); #ifdef CONFIG_CC_STACKPROTECTOR -#ifndef GCC_HAS_SP -#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this. -#endif - /* * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value -- cgit v1.1 From ad0b0fd554dfc126b5750d14908dccc3bbf602be Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 10 Feb 2009 11:42:26 -0800 Subject: sched, latencytop: incorporate review feedback from Andrew Morton Andrew had some suggestions for the latencytop file; this patch takes care of most of these: * Add documentation * Turn account_scheduler_latency into an inline function * Don't report negative values to userspace * Make the file operations struct const * Fix a few checkpatch.pl warnings Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/latencytop.c | 83 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 71 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 449db46..ca07c5c 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -9,6 +9,44 @@ * as published by the Free Software Foundation; version 2 * of the License. */ + +/* + * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is + * used by the "latencytop" userspace tool. The latency that is tracked is not + * the 'traditional' interrupt latency (which is primarily caused by something + * else consuming CPU), but instead, it is the latency an application encounters + * because the kernel sleeps on its behalf for various reasons. + * + * This code tracks 2 levels of statistics: + * 1) System level latency + * 2) Per process latency + * + * The latency is stored in fixed sized data structures in an accumulated form; + * if the "same" latency cause is hit twice, this will be tracked as one entry + * in the data structure. Both the count, total accumulated latency and maximum + * latency are tracked in this data structure. When the fixed size structure is + * full, no new causes are tracked until the buffer is flushed by writing to + * the /proc file; the userspace tool does this on a regular basis. + * + * A latency cause is identified by a stringified backtrace at the point that + * the scheduler gets invoked. The userland tool will use this string to + * identify the cause of the latency in human readable form. + * + * The information is exported via /proc/latency_stats and /proc//latency. + * These files look like this: + * + * Latency Top version : v0.1 + * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl + * | | | | + * | | | +----> the stringified backtrace + * | | +---------> The maximum latency for this entry in microseconds + * | +--------------> The accumulated latency for this entry (microseconds) + * +-------------------> The number of times this entry is hit + * + * (note: the average latency is the accumulated latency divided by the number + * of times) + */ + #include #include #include @@ -72,7 +110,7 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record firstnonnull = i; continue; } - for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long record = lat->backtrace[q]; if (latency_record[i].backtrace[q] != record) { @@ -101,31 +139,52 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record memcpy(&latency_record[i], lat, sizeof(struct latency_record)); } -static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) +/* + * Iterator to store a backtrace into a latency record entry + */ +static inline void store_stacktrace(struct task_struct *tsk, + struct latency_record *lat) { struct stack_trace trace; memset(&trace, 0, sizeof(trace)); trace.max_entries = LT_BACKTRACEDEPTH; trace.entries = &lat->backtrace[0]; - trace.skip = 0; save_stack_trace_tsk(tsk, &trace); } +/** + * __account_scheduler_latency - record an occured latency + * @tsk - the task struct of the task hitting the latency + * @usecs - the duration of the latency in microseconds + * @inter - 1 if the sleep was interruptible, 0 if uninterruptible + * + * This function is the main entry point for recording latency entries + * as called by the scheduler. + * + * This function has a few special cases to deal with normal 'non-latency' + * sleeps: specifically, interruptible sleep longer than 5 msec is skipped + * since this usually is caused by waiting for events via select() and co. + * + * Negative latencies (caused by time going backwards) are also explicitly + * skipped. + */ void __sched -account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) +__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) { unsigned long flags; int i, q; struct latency_record lat; - if (!latencytop_enabled) - return; - /* Long interruptible waits are generally user requested... */ if (inter && usecs > 5000) return; + /* Negative sleeps are time going backwards */ + /* Zero-time sleeps are non-interesting */ + if (usecs <= 0) + return; + memset(&lat, 0, sizeof(lat)); lat.count = 1; lat.time = usecs; @@ -143,12 +202,12 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) if (tsk->latency_record_count >= LT_SAVECOUNT) goto out_unlock; - for (i = 0; i < LT_SAVECOUNT ; i++) { + for (i = 0; i < LT_SAVECOUNT; i++) { struct latency_record *mylat; int same = 1; mylat = &tsk->latency_record[i]; - for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long record = lat.backtrace[q]; if (mylat->backtrace[q] != record) { @@ -186,7 +245,7 @@ static int lstats_show(struct seq_file *m, void *v) for (i = 0; i < MAXLR; i++) { if (latency_record[i].backtrace[0]) { int q; - seq_printf(m, "%i %li %li ", + seq_printf(m, "%i %lu %lu ", latency_record[i].count, latency_record[i].time, latency_record[i].max); @@ -223,7 +282,7 @@ static int lstats_open(struct inode *inode, struct file *filp) return single_open(filp, lstats_show, NULL); } -static struct file_operations lstats_fops = { +static const struct file_operations lstats_fops = { .open = lstats_open, .read = seq_read, .write = lstats_write, @@ -236,4 +295,4 @@ static int __init init_lstats_procfs(void) proc_create("latency_stats", 0644, NULL, &lstats_fops); return 0; } -__initcall(init_lstats_procfs); +device_initcall(init_lstats_procfs); -- cgit v1.1 From 0e43785c57fee50fbc00ea0378e941efb61fa0c2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 13 Feb 2009 04:38:04 +0100 Subject: irq: use GFP_KERNEL for action allocation in request_irq() request_irq() calls into proc code via __setup_irq() which is not safe in an atomic context, so request_irq() can itself use the more reliable GFP_KERNEL allocation for the action descriptor. Signed-off-by: Johannes Weiner Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index cd0cd8d..1c50550 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -717,7 +717,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, if (!handler) return -EINVAL; - action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); + action = kmalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; -- cgit v1.1 From 327ec5699c29454322d0136375f717f509c145b6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 15 Feb 2009 11:21:37 +0100 Subject: irq: clean up manage.c - make printk message git-greppable - fix a few style details Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1c50550..8f4bc61 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -397,7 +397,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * allocate special interrupts that are part of the architecture. */ static int -__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) +__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **p; const char *old_name = NULL; @@ -687,11 +687,12 @@ int request_irq(unsigned int irq, irq_handler_t handler, * the behavior is classified as "will not fix" so we need to * start nudging drivers away from using that idiom. */ - if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) - == (IRQF_SHARED|IRQF_DISABLED)) - pr_warning("IRQ %d/%s: IRQF_DISABLED is not " - "guaranteed on shared IRQs\n", - irq, devname); + if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) == + (IRQF_SHARED|IRQF_DISABLED)) { + pr_warning( + "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n", + irq, devname); + } #ifdef CONFIG_LOCKDEP /* -- cgit v1.1 From ae88a23b32fa7e0dc9fa7ce735966e68eb41b0bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 15 Feb 2009 11:29:50 +0100 Subject: irq: refactor and clean up the free_irq() code flow Impact: cleanup - separate out the loop from the actual freeing logic, this wins us two indentation levels allowing a number of followup prettifications - turn the WARN_ON() into a more informative WARN(). - clean up the comments and the code flow some more Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 101 ++++++++++++++++++++++++++++------------------------ 1 file changed, 54 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8f4bc61..7a954b8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -575,72 +575,79 @@ int setup_irq(unsigned int irq, struct irqaction *act) void free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - struct irqaction **p; + struct irqaction *action, **p, **pp; unsigned long flags; - WARN_ON(in_interrupt()); + WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) return; spin_lock_irqsave(&desc->lock, flags); + + /* + * There can be multiple actions per IRQ descriptor, find the right + * one based on the dev_id: + */ p = &desc->action; for (;;) { - struct irqaction *action = *p; + action = *p; + pp = p; + + if (!action) { + WARN(1, "Trying to free already-free IRQ %d\n", irq); + spin_unlock_irqrestore(&desc->lock, flags); + + return; + } - if (action) { - struct irqaction **pp = p; + p = &action->next; + if (action->dev_id != dev_id) + continue; - p = &action->next; - if (action->dev_id != dev_id) - continue; + break; + } - /* Found it - now remove it from the list of entries */ - *pp = action->next; + /* Found it - now remove it from the list of entries: */ + *pp = action->next; - /* Currently used only by UML, might disappear one day.*/ + /* Currently used only by UML, might disappear one day: */ #ifdef CONFIG_IRQ_RELEASE_METHOD - if (desc->chip->release) - desc->chip->release(irq, dev_id); + if (desc->chip->release) + desc->chip->release(irq, dev_id); #endif - if (!desc->action) { - desc->status |= IRQ_DISABLED; - if (desc->chip->shutdown) - desc->chip->shutdown(irq); - else - desc->chip->disable(irq); - } - spin_unlock_irqrestore(&desc->lock, flags); - unregister_handler_proc(irq, action); + /* If this was the last handler, shut down the IRQ line: */ + if (!desc->action) { + desc->status |= IRQ_DISABLED; + if (desc->chip->shutdown) + desc->chip->shutdown(irq); + else + desc->chip->disable(irq); + } + spin_unlock_irqrestore(&desc->lock, flags); + + unregister_handler_proc(irq, action); + + /* Make sure it's not being used on another CPU: */ + synchronize_irq(irq); - /* Make sure it's not being used on another CPU */ - synchronize_irq(irq); -#ifdef CONFIG_DEBUG_SHIRQ - /* - * It's a shared IRQ -- the driver ought to be - * prepared for it to happen even now it's - * being freed, so let's make sure.... We do - * this after actually deregistering it, to - * make sure that a 'real' IRQ doesn't run in - * parallel with our fake - */ - if (action->flags & IRQF_SHARED) { - local_irq_save(flags); - action->handler(irq, dev_id); - local_irq_restore(flags); - } -#endif - kfree(action); - return; - } - printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); #ifdef CONFIG_DEBUG_SHIRQ - dump_stack(); -#endif - spin_unlock_irqrestore(&desc->lock, flags); - return; + /* + * It's a shared IRQ -- the driver ought to be prepared for an IRQ + * event to happen even now it's being freed, so let's make sure that + * is so by doing an extra call to the handler .... + * + * ( We do this after actually deregistering it, to make sure that a + * 'real' IRQ doesn't run in * parallel with our fake. ) + */ + if (action->flags & IRQF_SHARED) { + local_irq_save(flags); + action->handler(irq, dev_id); + local_irq_restore(flags); } +#endif + kfree(action); } EXPORT_SYMBOL(free_irq); -- cgit v1.1 From a0a522ce3d6d8c907e45d4f2730ee8573484cc88 Mon Sep 17 00:00:00 2001 From: Henrik Austad Date: Fri, 13 Feb 2009 20:35:45 +0100 Subject: sched: idle_at_tick is only used when CONFIG_SMP is set Impact: struct rq size optimization The idle_at_tick in struct rq is only used in SMP settings and it does not make sense to have this in the rq in an UP setup. Signed-off-by: Henrik Austad Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5faf5d4..648154c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -555,7 +555,6 @@ struct rq { unsigned long nr_running; #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned char idle_at_tick; #ifdef CONFIG_NO_HZ unsigned long last_tick_seen; unsigned char in_nohz_recently; @@ -596,6 +595,7 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; + unsigned char idle_at_tick; /* For active balancing */ int active_balance; int push_cpu; -- cgit v1.1 From a038a353c3de4040d8445ec568acebdac144436f Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 12 Feb 2009 05:03:34 +0000 Subject: clocksource: allow usage independent of timekeeping.c So far struct clocksource acted as the interface between time/timekeeping.c and hardware. This patch generalizes the concept so that a similar interface can also be used in other contexts. For that it introduces new structures and related functions *without* touching the existing struct clocksource. The reasons for adding these new structures to clocksource.[ch] are * the APIs are clearly related * struct clocksource could be cleaned up to use the new structs * avoids proliferation of files with similar names (timesource.h? timecounter.h?) As outlined in the discussion with John Stultz, this patch adds * struct cyclecounter: stateless API to hardware which counts clock cycles * struct timecounter: stateful utility code built on a cyclecounter which provides a nanosecond counter * only the function to read the nanosecond counter; deltas are used internally and not exposed to users of timecounter The code does no locking of the shared state. It must be called at least as often as the cycle counter wraps around to detect these wrap arounds. Both is the responsibility of the timecounter user. Acked-by: John Stultz Signed-off-by: Patrick Ohly Signed-off-by: David S. Miller --- kernel/time/clocksource.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ca89e15..c46c931 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -31,6 +31,82 @@ #include /* for spin_unlock_irq() using preempt_count() m68k */ #include +void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp) +{ + tc->cc = cc; + tc->cycle_last = cc->read(cc); + tc->nsec = start_tstamp; +} +EXPORT_SYMBOL(timecounter_init); + +/** + * timecounter_read_delta - get nanoseconds since last call of this function + * @tc: Pointer to time counter + * + * When the underlying cycle counter runs over, this will be handled + * correctly as long as it does not run over more than once between + * calls. + * + * The first call to this function for a new time counter initializes + * the time tracking and returns an undefined result. + */ +static u64 timecounter_read_delta(struct timecounter *tc) +{ + cycle_t cycle_now, cycle_delta; + u64 ns_offset; + + /* read cycle counter: */ + cycle_now = tc->cc->read(tc->cc); + + /* calculate the delta since the last timecounter_read_delta(): */ + cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; + + /* convert to nanoseconds: */ + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); + + /* update time stamp of timecounter_read_delta() call: */ + tc->cycle_last = cycle_now; + + return ns_offset; +} + +u64 timecounter_read(struct timecounter *tc) +{ + u64 nsec; + + /* increment time by nanoseconds since last call */ + nsec = timecounter_read_delta(tc); + nsec += tc->nsec; + tc->nsec = nsec; + + return nsec; +} +EXPORT_SYMBOL(timecounter_read); + +u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp) +{ + u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec; + + /* + * Instead of always treating cycle_tstamp as more recent + * than tc->cycle_last, detect when it is too far in the + * future and treat it as old time stamp instead. + */ + if (cycle_delta > tc->cc->mask / 2) { + cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); + } else { + nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; + } + + return nsec; +} +EXPORT_SYMBOL(timecounter_cyc2time); + /* XXX - Would like a better way for initializing curr_clocksource */ extern struct clocksource clocksource_jiffies; -- cgit v1.1 From a75244c3d519fcb490ca2bf3f123c98017f1e8d0 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 12 Feb 2009 05:03:35 +0000 Subject: timecompare: generic infrastructure to map between two time bases Mapping from a struct timecounter to a time returned by functions like ktime_get_real() is implemented. This is sufficient to use this code in a network device driver which wants to support hardware time stamping and transformation of hardware time stamps to system time. The interface could have been made more versatile by not depending on a time counter, but this wasn't done to avoid writing glue code elsewhere. The method implemented here is the one used and analyzed under the name "assisted PTP" in the LCI PTP paper: http://www.linuxclustersinstitute.org/conferences/archive/2008/PDF/Ohly_92221.pdf Acked-by: John Stultz Signed-off-by: Patrick Ohly Signed-off-by: David S. Miller --- kernel/time/Makefile | 2 +- kernel/time/timecompare.c | 191 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 192 insertions(+), 1 deletion(-) create mode 100644 kernel/time/timecompare.c (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 905b0b5..0b0a636 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c new file mode 100644 index 0000000..71e7f1a --- /dev/null +++ b/kernel/time/timecompare.c @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2009 Intel Corporation. + * Author: Patrick Ohly + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include + +/* + * fixed point arithmetic scale factor for skew + * + * Usually one would measure skew in ppb (parts per billion, 1e9), but + * using a factor of 2 simplifies the math. + */ +#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30) + +ktime_t timecompare_transform(struct timecompare *sync, + u64 source_tstamp) +{ + u64 nsec; + + nsec = source_tstamp + sync->offset; + nsec += (s64)(source_tstamp - sync->last_update) * sync->skew / + TIMECOMPARE_SKEW_RESOLUTION; + + return ns_to_ktime(nsec); +} +EXPORT_SYMBOL(timecompare_transform); + +int timecompare_offset(struct timecompare *sync, + s64 *offset, + u64 *source_tstamp) +{ + u64 start_source = 0, end_source = 0; + struct { + s64 offset; + s64 duration_target; + } buffer[10], sample, *samples; + int counter = 0, i; + int used; + int index; + int num_samples = sync->num_samples; + + if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { + samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); + if (!samples) { + samples = buffer; + num_samples = sizeof(buffer)/sizeof(buffer[0]); + } + } else { + samples = buffer; + } + + /* run until we have enough valid samples, but do not try forever */ + i = 0; + counter = 0; + while (1) { + u64 ts; + ktime_t start, end; + + start = sync->target(); + ts = timecounter_read(sync->source); + end = sync->target(); + + if (!i) + start_source = ts; + + /* ignore negative durations */ + sample.duration_target = ktime_to_ns(ktime_sub(end, start)); + if (sample.duration_target >= 0) { + /* + * assume symetric delay to and from source: + * average target time corresponds to measured + * source time + */ + sample.offset = + ktime_to_ns(ktime_add(end, start)) / 2 - + ts; + + /* simple insertion sort based on duration */ + index = counter - 1; + while (index >= 0) { + if (samples[index].duration_target < + sample.duration_target) + break; + samples[index + 1] = samples[index]; + index--; + } + samples[index + 1] = sample; + counter++; + } + + i++; + if (counter >= num_samples || i >= 100000) { + end_source = ts; + break; + } + } + + *source_tstamp = (end_source + start_source) / 2; + + /* remove outliers by only using 75% of the samples */ + used = counter * 3 / 4; + if (!used) + used = counter; + if (used) { + /* calculate average */ + s64 off = 0; + for (index = 0; index < used; index++) + off += samples[index].offset; + *offset = div_s64(off, used); + } + + if (samples && samples != buffer) + kfree(samples); + + return used; +} +EXPORT_SYMBOL(timecompare_offset); + +void __timecompare_update(struct timecompare *sync, + u64 source_tstamp) +{ + s64 offset; + u64 average_time; + + if (!timecompare_offset(sync, &offset, &average_time)) + return; + + if (!sync->last_update) { + sync->last_update = average_time; + sync->offset = offset; + sync->skew = 0; + } else { + s64 delta_nsec = average_time - sync->last_update; + + /* avoid division by negative or small deltas */ + if (delta_nsec >= 10000) { + s64 delta_offset_nsec = offset - sync->offset; + s64 skew; /* delta_offset_nsec * + TIMECOMPARE_SKEW_RESOLUTION / + delta_nsec */ + u64 divisor; + + /* div_s64() is limited to 32 bit divisor */ + skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION; + divisor = delta_nsec; + while (unlikely(divisor >= ((s64)1) << 32)) { + /* divide both by 2; beware, right shift + of negative value has undefined + behavior and can only be used for + the positive divisor */ + skew = div_s64(skew, 2); + divisor >>= 1; + } + skew = div_s64(skew, divisor); + + /* + * Calculate new overall skew as 4/16 the + * old value and 12/16 the new one. This is + * a rather arbitrary tradeoff between + * only using the latest measurement (0/16 and + * 16/16) and even more weight on past measurements. + */ +#define TIMECOMPARE_NEW_SKEW_PER_16 12 + sync->skew = + div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) * + sync->skew + + TIMECOMPARE_NEW_SKEW_PER_16 * skew, + 16); + sync->last_update = average_time; + sync->offset = offset; + } + } +} +EXPORT_SYMBOL(__timecompare_update); -- cgit v1.1 From 2b8f836fb196acede88b6cc772e9057e0a9c0223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9rico=20Wang?= Date: Mon, 16 Feb 2009 18:54:21 +0800 Subject: sched: use TASK_NICE for task_struct #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) So it's better to use TASK_NICE here. Signed-off-by: WANG Cong Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 648154c..5475d56 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5236,7 +5236,7 @@ SYSCALL_DEFINE1(nice, int, increment) if (increment > 40) increment = 40; - nice = PRIO_TO_NICE(current->static_prio) + increment; + nice = TASK_NICE(current) + increment; if (nice < -20) nice = -20; if (nice > 19) -- cgit v1.1 From 8316e38100c70cd1443ac90074eccdd033aa218d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 20:28:29 +0100 Subject: irq: further clean up the free_irq() code flow Linus noticed that the 'pp' variable can be eliminated altogether, and the loop can be cleaned up further. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7a954b8..de5a765 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -575,7 +575,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) void free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action, **p, **pp; + struct irqaction *action, **p; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -592,7 +592,6 @@ void free_irq(unsigned int irq, void *dev_id) p = &desc->action; for (;;) { action = *p; - pp = p; if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); @@ -601,15 +600,13 @@ void free_irq(unsigned int irq, void *dev_id) return; } + if (action->dev_id == dev_id) + break; p = &action->next; - if (action->dev_id != dev_id) - continue; - - break; } /* Found it - now remove it from the list of entries: */ - *pp = action->next; + *p = action->next; /* Currently used only by UML, might disappear one day: */ #ifdef CONFIG_IRQ_RELEASE_METHOD -- cgit v1.1 From f17c75453b2d195eba0a90d9f16a3ba88c85b3b4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 20:43:37 +0100 Subject: irq: name 'p' variables a bit better 'p' stands for pointer - make it clear in setup_irq() and free_irq() what kind of pointer it is. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index de5a765..c589305 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -399,7 +399,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { - struct irqaction *old, **p; + struct irqaction *old, **old_ptr; const char *old_name = NULL; unsigned long flags; int shared = 0; @@ -431,8 +431,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * The following block of code has to be executed atomically */ spin_lock_irqsave(&desc->lock, flags); - p = &desc->action; - old = *p; + old_ptr = &desc->action; + old = *old_ptr; if (old) { /* * Can't share interrupts unless both agree to and are @@ -455,8 +455,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* add new interrupt at end of irq queue */ do { - p = &old->next; - old = *p; + old_ptr = &old->next; + old = *old_ptr; } while (old); shared = 1; } @@ -507,7 +507,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) (int)(new->flags & IRQF_TRIGGER_MASK)); } - *p = new; + *old_ptr = new; /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; @@ -575,7 +575,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) void free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action, **p; + struct irqaction *action, **action_ptr; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -589,9 +589,9 @@ void free_irq(unsigned int irq, void *dev_id) * There can be multiple actions per IRQ descriptor, find the right * one based on the dev_id: */ - p = &desc->action; + action_ptr = &desc->action; for (;;) { - action = *p; + action = *action_ptr; if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); @@ -602,11 +602,11 @@ void free_irq(unsigned int irq, void *dev_id) if (action->dev_id == dev_id) break; - p = &action->next; + action_ptr = &action->next; } /* Found it - now remove it from the list of entries: */ - *p = action->next; + *action_ptr = action->next; /* Currently used only by UML, might disappear one day: */ #ifdef CONFIG_IRQ_RELEASE_METHOD -- cgit v1.1 From 74019224ac34b044b44a31dd89a54e3477db4896 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Feb 2009 12:23:29 +0100 Subject: timers: add mod_timer_pending() Impact: new timer API Based on an idea from Martin Josefsson with the help of Patrick McHardy and Stephen Hemminger: introduce the mod_timer_pending() API which is a mod_timer() offspring that is an invariant on already removed timers. (regular mod_timer() re-activates non-pending timers.) This is useful for the networking code in that it can allow unserialized mod_timer_pending() timer-forwarding calls, but a single del_timer*() will stop the timer from being reactivated again. Also while at it: - optimize the regular mod_timer() path some more, the timer-stat and a debug check was needlessly duplicated in __mod_timer(). - make the exports come straight after the function, as most other exports in timer.c already did. - eliminate __mod_timer() as an external API, change the users to mod_timer(). The regular mod_timer() code path is not impacted significantly, due to inlining optimizations and due to the simplifications. Based-on-patch-from: Stephen Hemminger Acked-by: Stephen Hemminger Cc: "David S. Miller" Cc: Patrick McHardy Cc: netdev@vger.kernel.org Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/relay.c | 2 +- kernel/timer.c | 110 ++++++++++++++++++++++++++++++++++++++------------------- 2 files changed, 74 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 9d79b78..8f2179c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -750,7 +750,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) * from the scheduler (trying to re-grab * rq->lock), so defer it. */ - __mod_timer(&buf->timer, jiffies + 1); + mod_timer(&buf->timer, jiffies + 1); } old = buf->data; diff --git a/kernel/timer.c b/kernel/timer.c index 13dd64f..9b77fc9 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -589,11 +589,14 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } } -int __mod_timer(struct timer_list *timer, unsigned long expires) +static inline int +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { struct tvec_base *base, *new_base; unsigned long flags; - int ret = 0; + int ret; + + ret = 0; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -603,6 +606,9 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer)) { detach_timer(timer, 0); ret = 1; + } else { + if (pending_only) + goto out_unlock; } debug_timer_activate(timer); @@ -629,42 +635,28 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) timer->expires = expires; internal_add_timer(base, timer); + +out_unlock: spin_unlock_irqrestore(&base->lock, flags); return ret; } -EXPORT_SYMBOL(__mod_timer); - /** - * add_timer_on - start a timer on a particular CPU - * @timer: the timer to be added - * @cpu: the CPU to start it on + * mod_timer_pending - modify a pending timer's timeout + * @timer: the pending timer to be modified + * @expires: new timeout in jiffies * - * This is not very scalable on SMP. Double adds are not possible. + * mod_timer_pending() is the same for pending timers as mod_timer(), + * but will not re-activate and modify already deleted timers. + * + * It is useful for unserialized use of timers. */ -void add_timer_on(struct timer_list *timer, int cpu) +int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - struct tvec_base *base = per_cpu(tvec_bases, cpu); - unsigned long flags; - - timer_stats_timer_set_start_info(timer); - BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->lock, flags); - timer_set_base(timer, base); - debug_timer_activate(timer); - internal_add_timer(base, timer); - /* - * Check whether the other CPU is idle and needs to be - * triggered to reevaluate the timer wheel when nohz is - * active. We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to idle can not evaluate - * the timer wheel. - */ - wake_up_idle_cpu(cpu); - spin_unlock_irqrestore(&base->lock, flags); + return __mod_timer(timer, expires, true); } +EXPORT_SYMBOL(mod_timer_pending); /** * mod_timer - modify a timer's timeout @@ -688,9 +680,6 @@ void add_timer_on(struct timer_list *timer, int cpu) */ int mod_timer(struct timer_list *timer, unsigned long expires) { - BUG_ON(!timer->function); - - timer_stats_timer_set_start_info(timer); /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -699,12 +688,62 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer->expires == expires && timer_pending(timer)) return 1; - return __mod_timer(timer, expires); + return __mod_timer(timer, expires, false); } - EXPORT_SYMBOL(mod_timer); /** + * add_timer - start a timer + * @timer: the timer to be added + * + * The kernel will do a ->function(->data) callback from the + * timer interrupt at the ->expires point in the future. The + * current time is 'jiffies'. + * + * The timer's ->expires, ->function (and if the handler uses it, ->data) + * fields must be set prior calling this function. + * + * Timers with an ->expires field in the past will be executed in the next + * timer tick. + */ +void add_timer(struct timer_list *timer) +{ + BUG_ON(timer_pending(timer)); + mod_timer(timer, timer->expires); +} +EXPORT_SYMBOL(add_timer); + +/** + * add_timer_on - start a timer on a particular CPU + * @timer: the timer to be added + * @cpu: the CPU to start it on + * + * This is not very scalable on SMP. Double adds are not possible. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ + struct tvec_base *base = per_cpu(tvec_bases, cpu); + unsigned long flags; + + timer_stats_timer_set_start_info(timer); + BUG_ON(timer_pending(timer) || !timer->function); + spin_lock_irqsave(&base->lock, flags); + timer_set_base(timer, base); + debug_timer_activate(timer); + internal_add_timer(base, timer); + /* + * Check whether the other CPU is idle and needs to be + * triggered to reevaluate the timer wheel when nohz is + * active. We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to idle can not evaluate + * the timer wheel. + */ + wake_up_idle_cpu(cpu); + spin_unlock_irqrestore(&base->lock, flags); +} + +/** * del_timer - deactive a timer. * @timer: the timer to be deactivated * @@ -733,7 +772,6 @@ int del_timer(struct timer_list *timer) return ret; } - EXPORT_SYMBOL(del_timer); #ifdef CONFIG_SMP @@ -767,7 +805,6 @@ out: return ret; } - EXPORT_SYMBOL(try_to_del_timer_sync); /** @@ -796,7 +833,6 @@ int del_timer_sync(struct timer_list *timer) cpu_relax(); } } - EXPORT_SYMBOL(del_timer_sync); #endif @@ -1268,7 +1304,7 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire); + __mod_timer(&timer, expire, false); schedule(); del_singleshot_timer_sync(&timer); -- cgit v1.1 From 712406a6bf59ebf4a00358bb59a4a2a1b2953d90 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Feb 2009 10:54:03 -0800 Subject: tracing/function-graph-tracer: make arch generic push pop functions There is nothing really arch specific of the push and pop functions used by the function graph tracer. This patch moves them to generic code. Acked-by: Frederic Weisbecker Acked-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 75 ++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 930c08e..dce71a5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -42,6 +42,81 @@ static struct tracer_flags tracer_flags = { /* pid on the last trace processed */ static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 }; +/* Add a function return address to the trace stack on thread info.*/ +int +ftrace_push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func, int *depth) +{ + int index; + + if (!current->ret_stack) + return -EBUSY; + + /* The return trace stack is full */ + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(¤t->trace_overrun); + return -EBUSY; + } + + index = ++current->curr_ret_stack; + barrier(); + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = time; + *depth = index; + + return 0; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +void +ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) +{ + int index; + + index = current->curr_ret_stack; + + if (unlikely(index < 0)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic, otherwise we have no where to go */ + *ret = (unsigned long)panic; + return; + } + + *ret = current->ret_stack[index].ret; + trace->func = current->ret_stack[index].func; + trace->calltime = current->ret_stack[index].calltime; + trace->overrun = atomic_read(¤t->trace_overrun); + trace->depth = index; + barrier(); + current->curr_ret_stack--; + +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ + struct ftrace_graph_ret trace; + unsigned long ret; + + ftrace_pop_return_trace(&trace, &ret); + trace.rettime = cpu_clock(raw_smp_processor_id()); + ftrace_graph_return(&trace); + + if (unlikely(!ret)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? */ + ret = (unsigned long)panic; + } + + return ret; +} + static int graph_trace_init(struct trace_array *tr) { int cpu, ret; -- cgit v1.1 From fdcedf7b75808dd72c3cc0b931be11b04d75c60a Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 18 Feb 2009 16:02:22 -0800 Subject: time: apply NTP frequency/tick changes immediately Since the GENERIC_TIME changes landed, the adjtimex behavior changed for struct timex.tick and .freq changed. When the tick or freq value is set, we adjust the tick_length_base in ntp_update_frequency(). However, this new value doesn't get applied to tick_length until the next second (via second_overflow). This means some applications that do quick time tweaking do not see the requested change made as quickly as expected. I've run a few tests with this change, and ntpd still functions fine. Signed-off-by: John Stultz Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f5f793d..e1fa368 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -51,6 +51,7 @@ static long ntp_tick_adj; static void ntp_update_frequency(void) { + u64 old_tick_length_base = tick_length_base; u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT; second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; @@ -60,6 +61,12 @@ static void ntp_update_frequency(void) tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); + + /* + * Don't wait for the next second_overflow, apply + * the change to the tick length immediately + */ + tick_length += tick_length_base - old_tick_length_base; } static void ntp_update_offset(long offset) -- cgit v1.1 From 6b588c18f8dacfa6d7957c33c5ff832096e752d3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:07 +0900 Subject: module: reorder module pcpu related functions Impact: cleanup Move percpu_modinit() upwards. This is to ease further changes. Signed-off-by: Tejun Heo --- kernel/module.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ba22484..52b3497 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -480,21 +480,6 @@ static void percpu_modfree(void *freeme) } } -static unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) -{ - return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); -} - -static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) -{ - int cpu; - - for_each_possible_cpu(cpu) - memcpy(pcpudest + per_cpu_offset(cpu), from, size); -} - static int percpu_modinit(void) { pcpu_num_used = 2; @@ -513,7 +498,24 @@ static int percpu_modinit(void) return 0; } __initcall(percpu_modinit); + +static unsigned int find_pcpusec(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + const char *secstrings) +{ + return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); +} + +static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +{ + int cpu; + + for_each_possible_cpu(cpu) + memcpy(pcpudest + per_cpu_offset(cpu), from, size); +} + #else /* ... !CONFIG_SMP */ + static inline void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) { @@ -535,6 +537,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src, /* pcpusec should be 0, and size of that section should be 0. */ BUG_ON(size != 0); } + #endif /* CONFIG_SMP */ #define MODINFO_ATTR(field) \ -- cgit v1.1 From b36128c830a8f5bd7d4981f5b0b69950f5928ee6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: alloc_percpu: change percpu_ptr to per_cpu_ptr Impact: cleanup There are two allocated per-cpu accessor macros with almost identical spelling. The original and far more popular is per_cpu_ptr (44 files), so change over the other 4 files. tj: kill percpu_ptr() and update UP too Signed-off-by: Rusty Russell Cc: mingo@redhat.com Cc: lenb@kernel.org Cc: cpufreq@vger.kernel.org Signed-off-by: Tejun Heo --- kernel/sched.c | 6 +++--- kernel/stop_machine.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index fc17fd9..9d30ac9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9472,7 +9472,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; #ifndef CONFIG_64BIT @@ -9491,7 +9491,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); #ifndef CONFIG_64BIT /* @@ -9587,7 +9587,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); for (; ca; ca = ca->parent) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 0cd415e..74541ca 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) * doesn't hit this CPU until we're ready. */ get_cpu(); for_each_online_cpu(i) { - sm_work = percpu_ptr(stop_machine_work, i); + sm_work = per_cpu_ptr(stop_machine_work, i); INIT_WORK(sm_work, stop_cpu); queue_work_on(i, stop_machine_wq, sm_work); } -- cgit v1.1 From fbf59bc9d74d1fb30b8e0630743aff2806eafcea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: percpu: implement new dynamic percpu allocator Impact: new scalable dynamic percpu allocator which allows dynamic percpu areas to be accessed the same way as static ones Implement scalable dynamic percpu allocator which can be used for both static and dynamic percpu areas. This will allow static and dynamic areas to share faster direct access methods. This feature is optional and enabled only when CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is defined by arch. Please read comment on top of mm/percpu.c for details. Signed-off-by: Tejun Heo Cc: Andrew Morton --- kernel/module.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 52b3497..1f0657a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -51,6 +51,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -366,6 +367,34 @@ static struct module *find_module(const char *name) } #ifdef CONFIG_SMP + +#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + +static void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) +{ + void *ptr; + + if (align > PAGE_SIZE) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + name, align, PAGE_SIZE); + align = PAGE_SIZE; + } + + ptr = __alloc_percpu(size, align); + if (!ptr) + printk(KERN_WARNING + "Could not allocate %lu bytes percpu data\n", size); + return ptr; +} + +static void percpu_modfree(void *freeme) +{ + free_percpu(freeme); +} + +#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; /* Size of each block. -ve means used. */ @@ -499,6 +528,8 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); +#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) -- cgit v1.1 From 53bbfa9e9437e70b322368e82c723112d690e304 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 Feb 2008 07:58:42 +0100 Subject: time: ntp: clean up kernel/time/ntp.c Impact: cleanup, no functionality changed Make this file a bit more readable by applying a consistent coding style. No code changed: kernel/time/ntp.o: text data bss dec hex filename 2552 170 168 2890 b4a ntp.o.before 2552 170 168 2890 b4a ntp.o.after md5: eae1275df0b7d6290c13f6f6f8f05c8c ntp.o.before.asm eae1275df0b7d6290c13f6f6f8f05c8c ntp.o.after.asm Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 129 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 81 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index e1fa368..3479ec4 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -1,53 +1,81 @@ /* - * linux/kernel/time/ntp.c - * * NTP state machine interfaces and logic. * * This code was mainly moved from kernel/timer.c and kernel/time.c * Please see those files for relevant copyright info and historical * changelogs. */ - -#include -#include -#include -#include -#include #include -#include #include #include -#include +#include +#include +#include +#include +#include +#include /* - * Timekeeping variables + * NTP timekeeping variables: */ -unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ -unsigned long tick_nsec; /* ACTHZ period (nsec) */ -u64 tick_length; -static u64 tick_length_base; -static struct hrtimer leap_timer; +/* USER_HZ period (usecs): */ +unsigned long tick_usec = TICK_USEC; + +/* ACTHZ period (nsecs): */ +unsigned long tick_nsec; -#define MAX_TICKADJ 500 /* microsecs */ -#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ - NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) +u64 tick_length; +static u64 tick_length_base; + +static struct hrtimer leap_timer; + +#define MAX_TICKADJ 500 /* usecs */ +#define MAX_TICKADJ_SCALED \ + (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) /* * phase-lock loop variables */ -/* TIME_ERROR prevents overwriting the CMOS clock */ -static int time_state = TIME_OK; /* clock synchronization status */ -int time_status = STA_UNSYNC; /* clock status bits */ -static long time_tai; /* TAI offset (s) */ -static s64 time_offset; /* time adjustment (ns) */ -static long time_constant = 2; /* pll time constant */ -long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ -long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -static s64 time_freq; /* frequency offset (scaled ns/s)*/ -static long time_reftime; /* time at last adjustment (s) */ -long time_adjust; -static long ntp_tick_adj; + +/* + * clock synchronization status + * + * (TIME_ERROR prevents overwriting the CMOS clock) + */ +static int time_state = TIME_OK; + +/* clock status bits: */ +int time_status = STA_UNSYNC; + +/* TAI offset (secs): */ +static long time_tai; + +/* time adjustment (nsecs): */ +static s64 time_offset; + +/* pll time constant: */ +static long time_constant = 2; + +/* maximum error (usecs): */ +long time_maxerror = NTP_PHASE_LIMIT; + +/* estimated error (usecs): */ +long time_esterror = NTP_PHASE_LIMIT; + +/* frequency offset (scaled nsecs/secs): */ +static s64 time_freq; + +/* time at last adjustment (secs): */ +static long time_reftime; + +long time_adjust; + +static long ntp_tick_adj; + +/* + * NTP methods: + */ static void ntp_update_frequency(void) { @@ -118,15 +146,15 @@ static void ntp_update_offset(long offset) */ void ntp_clear(void) { - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; ntp_update_frequency(); - tick_length = tick_length_base; - time_offset = 0; + tick_length = tick_length_base; + time_offset = 0; } /* @@ -147,8 +175,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) xtime.tv_sec--; wall_to_monotonic.tv_sec++; time_state = TIME_OOP; - printk(KERN_NOTICE "Clock: " - "inserting leap second 23:59:60 UTC\n"); + printk(KERN_NOTICE + "Clock: inserting leap second 23:59:60 UTC\n"); hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); res = HRTIMER_RESTART; break; @@ -157,8 +185,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) time_tai--; wall_to_monotonic.tv_sec--; time_state = TIME_WAIT; - printk(KERN_NOTICE "Clock: " - "deleting leap second 23:59:59 UTC\n"); + printk(KERN_NOTICE + "Clock: deleting leap second 23:59:59 UTC\n"); break; case TIME_OOP: time_tai++; @@ -199,10 +227,10 @@ void second_overflow(void) * Compute the phase adjustment for the next second. The offset is * reduced by a fixed factor times the time constant. */ - tick_length = tick_length_base; - time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); - time_offset -= time_adj; - tick_length += time_adj; + tick_length = tick_length_base; + time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); + time_offset -= time_adj; + tick_length += time_adj; if (unlikely(time_adjust)) { if (time_adjust > MAX_TICKADJ) { @@ -240,12 +268,13 @@ static void sync_cmos_clock(struct work_struct *work) * This code is run on a timer. If the clock is set, that timer * may not expire at the correct time. Thus, we adjust... */ - if (!ntp_synced()) + if (!ntp_synced()) { /* * Not synced, exit, do not restart a timer (if one is * running, let it run out). */ return; + } getnstimeofday(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) @@ -277,7 +306,8 @@ static void notify_cmos_timer(void) static inline void notify_cmos_timer(void) { } #endif -/* adjtimex mainly allows reading (and writing, if superuser) of +/* + * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ int do_adjtimex(struct timex *txc) @@ -298,7 +328,10 @@ int do_adjtimex(struct timex *txc) if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; - /* if the quartz is off by more than 10% something is VERY wrong! */ + /* + * if the quartz is off by more than 10% then + * something is VERY wrong! + */ if (txc->modes & ADJ_TICK && (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) -- cgit v1.1 From 3c972c2444dcb7088999c32b8c5a7ab3b8a6c0b6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 12:06:57 +0100 Subject: time: ntp: simplify the second_overflow() code flow Impact: cleanup, no functionality changed Instead of a hierarchy of conditions, transform them to clean gradual conditions and return's. This makes the flow easier to read and makes the purpose of the function easier to understand. kernel/time/ntp.o: text data bss dec hex filename 2552 170 168 2890 b4a ntp.o.before 2552 170 168 2890 b4a ntp.o.after md5: eae1275df0b7d6290c13f6f6f8f05c8c ntp.o.before.asm eae1275df0b7d6290c13f6f6f8f05c8c ntp.o.after.asm Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 3479ec4..1fa6615 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -232,19 +232,24 @@ void second_overflow(void) time_offset -= time_adj; tick_length += time_adj; - if (unlikely(time_adjust)) { - if (time_adjust > MAX_TICKADJ) { - time_adjust -= MAX_TICKADJ; - tick_length += MAX_TICKADJ_SCALED; - } else if (time_adjust < -MAX_TICKADJ) { - time_adjust += MAX_TICKADJ; - tick_length -= MAX_TICKADJ_SCALED; - } else { - tick_length += (s64)(time_adjust * NSEC_PER_USEC / - NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; - time_adjust = 0; - } + if (!time_adjust) + return; + + if (time_adjust > MAX_TICKADJ) { + time_adjust -= MAX_TICKADJ; + tick_length += MAX_TICKADJ_SCALED; + return; } + + if (time_adjust < -MAX_TICKADJ) { + time_adjust += MAX_TICKADJ; + tick_length -= MAX_TICKADJ_SCALED; + return; + } + + tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) + << NTP_SCALE_SHIFT; + time_adjust = 0; } #ifdef CONFIG_GENERIC_CMOS_UPDATE -- cgit v1.1 From bbd1267690bb6940d0722dd33e929442c0409c01 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 12:11:11 +0100 Subject: time: ntp: simplify the MAX_TICKADJ_SCALED definition Impact: cleanup, no functionality changed There's an ugly u64 typecase in the MAX_TICKADJ_SCALED definition, this can be eliminated by making the MAX_TICKADJ constant's type 64-bit (signed). kernel/time/ntp.o: text data bss dec hex filename 2504 114 136 2754 ac2 ntp.o.before 2504 114 136 2754 ac2 ntp.o.after md5: 41f3009debc9b397d7394dd77d912f0a ntp.o.before.asm 41f3009debc9b397d7394dd77d912f0a ntp.o.after.asm Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 1fa6615..2b758c9 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -30,9 +30,9 @@ static u64 tick_length_base; static struct hrtimer leap_timer; -#define MAX_TICKADJ 500 /* usecs */ +#define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ - (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) + (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) /* * phase-lock loop variables -- cgit v1.1 From 9ce616aaefcb9309cb9c49a36310ebda6061b98b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 12:42:59 +0100 Subject: time: ntp: clean up ntp_update_frequency() Impact: cleanup, no functionality changed Prepare a refactoring of ntp_update_frequency(). kernel/time/ntp.o: text data bss dec hex filename 2504 114 136 2754 ac2 ntp.o.before 2504 114 136 2754 ac2 ntp.o.after md5: 41f3009debc9b397d7394dd77d912f0a ntp.o.before.asm 41f3009debc9b397d7394dd77d912f0a ntp.o.after.asm Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 2b758c9..7d281d9 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -77,24 +77,33 @@ static long ntp_tick_adj; * NTP methods: */ +/* + * Update (tick_length, tick_length_base, tick_nsec), based + * on (tick_usec, ntp_tick_adj, time_freq): + */ static void ntp_update_frequency(void) { - u64 old_tick_length_base = tick_length_base; - u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) - << NTP_SCALE_SHIFT; - second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; - second_length += time_freq; + u64 prev_base; + u64 second_length; + + prev_base = tick_length_base; + + second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) + << NTP_SCALE_SHIFT; + + second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; + second_length += time_freq; - tick_length_base = second_length; + tick_length_base = second_length; - tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; - tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); + tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; + tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); /* * Don't wait for the next second_overflow, apply * the change to the tick length immediately */ - tick_length += tick_length_base - old_tick_length_base; + tick_length += tick_length_base - prev_base; } static void ntp_update_offset(long offset) -- cgit v1.1 From bc26c31d446bc9c24cd6f7003777a05fe268ae48 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 12:17:36 +0100 Subject: time: ntp: refactor up ntp_update_frequency() Impact: cleanup, no functionality changed Change ntp_update_frequency() from a hard to follow code flow that uses global variables as temporaries, to a clean input+output flow. Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7d281d9..f1abad7 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -83,10 +83,8 @@ static long ntp_tick_adj; */ static void ntp_update_frequency(void) { - u64 prev_base; u64 second_length; - - prev_base = tick_length_base; + u64 new_base; second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT; @@ -94,16 +92,15 @@ static void ntp_update_frequency(void) second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; second_length += time_freq; - tick_length_base = second_length; - tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; - tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); + new_base = div_u64(second_length, NTP_INTERVAL_FREQ); /* * Don't wait for the next second_overflow, apply - * the change to the tick length immediately + * the change to the tick length immediately: */ - tick_length += tick_length_base - prev_base; + tick_length += new_base - tick_length_base; + tick_length_base = new_base; } static void ntp_update_offset(long offset) -- cgit v1.1 From f939890b6687e05c42361655fb6610fa08f5a601 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 12:57:49 +0100 Subject: time: ntp: refactor and clean up ntp_update_offset() Impact: cleanup, no functionality changed - introduce the ntp_update_offset_fll() helper - clean up the flow and variable naming kernel/time/ntp.o: text data bss dec hex filename 2504 114 136 2754 ac2 ntp.o.before 2504 114 136 2754 ac2 ntp.o.after md5: 01f7b8e1a5472a3056f9e4ae84d46315 ntp.o.before.asm 01f7b8e1a5472a3056f9e4ae84d46315 ntp.o.after.asm Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f1abad7..ee437e1 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -103,10 +103,27 @@ static void ntp_update_frequency(void) tick_length_base = new_base; } +static inline s64 ntp_update_offset_fll(s64 freq_adj, s64 offset64, long secs) +{ + time_status &= ~STA_MODE; + + if (secs < MINSEC) + return freq_adj; + + if (!(time_status & STA_FLL) && (secs <= MAXSEC)) + return freq_adj; + + freq_adj += div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); + time_status |= STA_MODE; + + return freq_adj; +} + static void ntp_update_offset(long offset) { - long mtemp; s64 freq_adj; + s64 offset64; + long secs; if (!(time_status & STA_PLL)) return; @@ -127,22 +144,21 @@ static void ntp_update_offset(long offset) */ if (time_status & STA_FREQHOLD || time_reftime == 0) time_reftime = xtime.tv_sec; - mtemp = xtime.tv_sec - time_reftime; + + secs = xtime.tv_sec - time_reftime; time_reftime = xtime.tv_sec; - freq_adj = (s64)offset * mtemp; - freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant); - time_status &= ~STA_MODE; - if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { - freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL), - mtemp); - time_status |= STA_MODE; - } - freq_adj += time_freq; - freq_adj = min(freq_adj, MAXFREQ_SCALED); - time_freq = max(freq_adj, -MAXFREQ_SCALED); + offset64 = offset; + freq_adj = (offset64 * secs) << + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + + freq_adj = ntp_update_offset_fll(freq_adj, offset64, secs); + + freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); + + time_freq = max(freq_adj, -MAXFREQ_SCALED); - time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); + time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); } /** -- cgit v1.1 From 478b7aab1682246a3d1e76e27a0aecb2f0013379 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 13:22:23 +0100 Subject: time: ntp: simplify ntp_update_offset_fll() Impact: cleanup, no functionality changed Change ntp_update_offset_fll() to delta logic instead of absolute value logic. This eliminates 'freq_adj' from the function. Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ee437e1..5202dde 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -103,20 +103,19 @@ static void ntp_update_frequency(void) tick_length_base = new_base; } -static inline s64 ntp_update_offset_fll(s64 freq_adj, s64 offset64, long secs) +static inline s64 ntp_update_offset_fll(s64 offset64, long secs) { time_status &= ~STA_MODE; if (secs < MINSEC) - return freq_adj; + return 0; if (!(time_status & STA_FLL) && (secs <= MAXSEC)) - return freq_adj; + return 0; - freq_adj += div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); time_status |= STA_MODE; - return freq_adj; + return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); } static void ntp_update_offset(long offset) @@ -152,7 +151,7 @@ static void ntp_update_offset(long offset) freq_adj = (offset64 * secs) << (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); - freq_adj = ntp_update_offset_fll(freq_adj, offset64, secs); + freq_adj += ntp_update_offset_fll(offset64, secs); freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); -- cgit v1.1 From c7986acba211e8285e14c9603fb89e6f4ea0b9f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 13:29:09 +0100 Subject: time: ntp: micro-optimize ntp_update_offset() Impact: cleanup, no functionality changed The time_reftime update in ntp_update_offset() to xtime.tv_sec is a convoluted way of saying that we want to freeze the frequency and want the 'secs' delta to be 0. Also make this branch unlikely. This shaves off 8 bytes from the code size: text data bss dec hex filename 2504 114 136 2754 ac2 ntp.o.before 2496 114 136 2746 aba ntp.o.after Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5202dde..580a350 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -141,10 +141,10 @@ static void ntp_update_offset(long offset) * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - if (time_status & STA_FREQHOLD || time_reftime == 0) - time_reftime = xtime.tv_sec; - secs = xtime.tv_sec - time_reftime; + if (unlikely(time_status & STA_FREQHOLD || time_reftime == 0)) + secs = 0; + time_reftime = xtime.tv_sec; offset64 = offset; -- cgit v1.1 From 10dd31a7a17254d6ba793305fc590455393e610e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 13:38:40 +0100 Subject: time: ntp: fix bug in ntp_update_offset() & do_adjtimex() Impact: change (fix) the way the NTP PLL seconds offset is initialized/tracked Fix a bug and do a micro-optimization: When PLL is enabled we do not reset time_reftime. If the PLL was off for a long time (for example after bootup), this is arguably the wrong thing to do. We already had a hack for the common boot-time case in ntp_update_offset(), in form of: if (unlikely(time_status & STA_FREQHOLD || time_reftime == 0)) secs = 0; But the update delta should be reset later on too - not just when the PLL is enabled for the first time after bootup. So do it on !STA_PLL -> STA_PLL transitions. This changes behavior, as previously if ntpd was disabled for a long time and we restarted it, we'd run from that last update, with a very large delta. Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 580a350..fc08eb1 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -142,7 +142,7 @@ static void ntp_update_offset(long offset) * and in which mode (PLL or FLL). */ secs = xtime.tv_sec - time_reftime; - if (unlikely(time_status & STA_FREQHOLD || time_reftime == 0)) + if (unlikely(time_status & STA_FREQHOLD)) secs = 0; time_reftime = xtime.tv_sec; @@ -394,6 +394,13 @@ int do_adjtimex(struct timex *txc) } /* only set allowed bits */ time_status &= STA_RONLY; + /* + * If we turn on PLL adjustments then reset the + * reference time to current time. + */ + if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) + time_reftime = xtime.tv_sec; + time_status |= txc->status & ~STA_RONLY; switch (time_state) { -- cgit v1.1 From 80f2257116474ceed5fccab510b4f7245c0f49d7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 15:15:32 +0100 Subject: time: ntp: refactor do_adjtimex() Impact: cleanup, no functionality changed do_adjtimex() is currently a monster function with a maze of branches. Refactor the txc->modes setting aspects of it into two new helper functions: process_adj_status() process_adjtimex_modes() kernel/time/ntp.o: text data bss dec hex filename 2512 114 136 2762 aca ntp.o.before 2512 114 136 2762 aca ntp.o.after Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 182 +++++++++++++++++++++++++++++------------------------- 1 file changed, 99 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index fc08eb1..aded09b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -332,6 +332,102 @@ static void notify_cmos_timer(void) static inline void notify_cmos_timer(void) { } #endif + +/* + * Propagate a new txc->status value into the NTP state: + */ +static inline void process_adj_status(struct timex *txc, struct timespec *ts) +{ + long now; + + if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { + time_state = TIME_OK; + time_status = STA_UNSYNC; + } + /* only set allowed bits */ + time_status &= STA_RONLY; + + /* + * If we turn on PLL adjustments then reset the + * reference time to current time. + */ + if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) + time_reftime = xtime.tv_sec; + + time_status |= txc->status & ~STA_RONLY; + + switch (time_state) { + case TIME_OK: + start_timer: + now = ts->tv_sec; + if (time_status & STA_INS) { + time_state = TIME_INS; + now += 86400 - now % 86400; + hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); + } else if (time_status & STA_DEL) { + time_state = TIME_DEL; + now += 86400 - (now + 1) % 86400; + hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); + } + break; + case TIME_INS: + case TIME_DEL: + time_state = TIME_OK; + goto start_timer; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + break; + case TIME_OOP: + hrtimer_restart(&leap_timer); + break; + } +} +/* + * Called with the xtime lock held, so we can access and modify + * all the global NTP state: + */ +static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) +{ + if (txc->modes & ADJ_STATUS) + process_adj_status(txc, ts); + + if (txc->modes & ADJ_NANO) + time_status |= STA_NANO; + if (txc->modes & ADJ_MICRO) + time_status &= ~STA_NANO; + + if (txc->modes & ADJ_FREQUENCY) { + time_freq = (s64)txc->freq * PPM_SCALE; + time_freq = min(time_freq, MAXFREQ_SCALED); + time_freq = max(time_freq, -MAXFREQ_SCALED); + } + + if (txc->modes & ADJ_MAXERROR) + time_maxerror = txc->maxerror; + if (txc->modes & ADJ_ESTERROR) + time_esterror = txc->esterror; + + if (txc->modes & ADJ_TIMECONST) { + time_constant = txc->constant; + if (!(time_status & STA_NANO)) + time_constant += 4; + time_constant = min(time_constant, (long)MAXTC); + time_constant = max(time_constant, 0l); + } + + if (txc->modes & ADJ_TAI && txc->constant > 0) + time_tai = txc->constant; + + if (txc->modes & ADJ_OFFSET) + ntp_update_offset(txc->offset); + if (txc->modes & ADJ_TICK) + tick_usec = txc->tick; + + if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) + ntp_update_frequency(); +} + /* * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. @@ -383,90 +479,10 @@ int do_adjtimex(struct timex *txc) txc->offset = save_adjust; goto adj_done; } - if (txc->modes) { - long sec; - - if (txc->modes & ADJ_STATUS) { - if ((time_status & STA_PLL) && - !(txc->status & STA_PLL)) { - time_state = TIME_OK; - time_status = STA_UNSYNC; - } - /* only set allowed bits */ - time_status &= STA_RONLY; - /* - * If we turn on PLL adjustments then reset the - * reference time to current time. - */ - if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) - time_reftime = xtime.tv_sec; - - time_status |= txc->status & ~STA_RONLY; - - switch (time_state) { - case TIME_OK: - start_timer: - sec = ts.tv_sec; - if (time_status & STA_INS) { - time_state = TIME_INS; - sec += 86400 - sec % 86400; - hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); - } else if (time_status & STA_DEL) { - time_state = TIME_DEL; - sec += 86400 - (sec + 1) % 86400; - hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); - } - break; - case TIME_INS: - case TIME_DEL: - time_state = TIME_OK; - goto start_timer; - break; - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - break; - case TIME_OOP: - hrtimer_restart(&leap_timer); - break; - } - } - - if (txc->modes & ADJ_NANO) - time_status |= STA_NANO; - if (txc->modes & ADJ_MICRO) - time_status &= ~STA_NANO; - if (txc->modes & ADJ_FREQUENCY) { - time_freq = (s64)txc->freq * PPM_SCALE; - time_freq = min(time_freq, MAXFREQ_SCALED); - time_freq = max(time_freq, -MAXFREQ_SCALED); - } - - if (txc->modes & ADJ_MAXERROR) - time_maxerror = txc->maxerror; - if (txc->modes & ADJ_ESTERROR) - time_esterror = txc->esterror; - - if (txc->modes & ADJ_TIMECONST) { - time_constant = txc->constant; - if (!(time_status & STA_NANO)) - time_constant += 4; - time_constant = min(time_constant, (long)MAXTC); - time_constant = max(time_constant, 0l); - } - - if (txc->modes & ADJ_TAI && txc->constant > 0) - time_tai = txc->constant; - - if (txc->modes & ADJ_OFFSET) - ntp_update_offset(txc->offset); - if (txc->modes & ADJ_TICK) - tick_usec = txc->tick; - - if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) - ntp_update_frequency(); - } + /* If there are input parameters, then process them: */ + if (txc->modes) + process_adjtimex_modes(txc, &ts); txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); -- cgit v1.1 From e96291653b2e4df02f160b574070f6e632868e5e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 15:35:18 +0100 Subject: time: ntp: refactor do_adjtimex() some more Impact: cleanup, no functionality changed Further simplify do_adjtimex(): - introduce the ntp_start_leap_timer() helper function - eliminate the goto adj_done complication Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 61 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index aded09b..4346ed6 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -332,14 +332,33 @@ static void notify_cmos_timer(void) static inline void notify_cmos_timer(void) { } #endif +/* + * Start the leap seconds timer: + */ +static inline void ntp_start_leap_timer(struct timespec *ts) +{ + long now = ts->tv_sec; + + if (time_status & STA_INS) { + time_state = TIME_INS; + now += 86400 - now % 86400; + hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); + + return; + } + + if (time_status & STA_DEL) { + time_state = TIME_DEL; + now += 86400 - (now + 1) % 86400; + hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); + } +} /* * Propagate a new txc->status value into the NTP state: */ static inline void process_adj_status(struct timex *txc, struct timespec *ts) { - long now; - if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; @@ -358,22 +377,12 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) switch (time_state) { case TIME_OK: - start_timer: - now = ts->tv_sec; - if (time_status & STA_INS) { - time_state = TIME_INS; - now += 86400 - now % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - } else if (time_status & STA_DEL) { - time_state = TIME_DEL; - now += 86400 - (now + 1) % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - } + ntp_start_leap_timer(ts); break; case TIME_INS: case TIME_DEL: time_state = TIME_OK; - goto start_timer; + ntp_start_leap_timer(ts); case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; @@ -394,6 +403,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts if (txc->modes & ADJ_NANO) time_status |= STA_NANO; + if (txc->modes & ADJ_MICRO) time_status &= ~STA_NANO; @@ -405,6 +415,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts if (txc->modes & ADJ_MAXERROR) time_maxerror = txc->maxerror; + if (txc->modes & ADJ_ESTERROR) time_esterror = txc->esterror; @@ -421,6 +432,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts if (txc->modes & ADJ_OFFSET) ntp_update_offset(txc->offset); + if (txc->modes & ADJ_TICK) tick_usec = txc->tick; @@ -457,7 +469,7 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_TICK && (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) - return -EINVAL; + return -EINVAL; if (txc->modes & ADJ_STATUS && time_state != TIME_OK) hrtimer_cancel(&leap_timer); @@ -467,7 +479,6 @@ int do_adjtimex(struct timex *txc) write_seqlock_irq(&xtime_lock); - /* If there are input parameters, then process them */ if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -477,19 +488,18 @@ int do_adjtimex(struct timex *txc) ntp_update_frequency(); } txc->offset = save_adjust; - goto adj_done; - } + } else { - /* If there are input parameters, then process them: */ - if (txc->modes) - process_adjtimex_modes(txc, &ts); + /* If there are input parameters, then process them: */ + if (txc->modes) + process_adjtimex_modes(txc, &ts); - txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); - if (!(time_status & STA_NANO)) - txc->offset /= NSEC_PER_USEC; + if (!(time_status & STA_NANO)) + txc->offset /= NSEC_PER_USEC; + } -adj_done: result = time_state; /* mostly `TIME_OK' */ if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; @@ -514,6 +524,7 @@ adj_done: txc->calcnt = 0; txc->errcnt = 0; txc->stbcnt = 0; + write_sequnlock_irq(&xtime_lock); txc->time.tv_sec = ts.tv_sec; -- cgit v1.1 From 2b9d1496e7835a603c340e8f0dd81f4b74d5f248 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 15:48:43 +0100 Subject: time: ntp: make 64-bit constants more robust Impact: cleanup, no functionality changed - make PPM_SCALE an explicit s64 constant, to remove (s64) casts from usage sites. kernel/time/ntp.o: text data bss dec hex filename 2536 114 136 2786 ae2 ntp.o.before 2536 114 136 2786 ae2 ntp.o.after md5: 40a7728d1188aa18e83e21a81fa7b150 ntp.o.before.asm 40a7728d1188aa18e83e21a81fa7b150 ntp.o.after.asm Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4346ed6..7447d57 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -408,7 +408,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts time_status &= ~STA_NANO; if (txc->modes & ADJ_FREQUENCY) { - time_freq = (s64)txc->freq * PPM_SCALE; + time_freq = txc->freq * PPM_SCALE; time_freq = min(time_freq, MAXFREQ_SCALED); time_freq = max(time_freq, -MAXFREQ_SCALED); } @@ -505,7 +505,7 @@ int do_adjtimex(struct timex *txc) result = TIME_ERROR; txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * - (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); + PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; -- cgit v1.1 From 069569e025706f27f939785f86a94d5d8ce55dce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 16:03:37 +0100 Subject: time: ntp: simplify ntp_tick_adj calculations Impact: micro-optimization Convert the (internal) ntp_tick_adj value we store from unscaled units to scaled units. This is a constant that we never modify, so scaling it up once during bootup is enough - we dont have to do it for every adjustment step. Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7447d57..a3fe7ef 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -71,7 +71,8 @@ static long time_reftime; long time_adjust; -static long ntp_tick_adj; +/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ +static s64 ntp_tick_adj; /* * NTP methods: @@ -89,7 +90,7 @@ static void ntp_update_frequency(void) second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT; - second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; + second_length += ntp_tick_adj; second_length += time_freq; tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; @@ -540,6 +541,8 @@ int do_adjtimex(struct timex *txc) static int __init ntp_tick_adj_setup(char *str) { ntp_tick_adj = simple_strtol(str, NULL, 0); + ntp_tick_adj <<= NTP_SCALE_SHIFT; + return 1; } -- cgit v1.1 From 39854fe8c165872d743f6a0c4860ca2de8e45ac9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 16:06:58 +0100 Subject: time: ntp: clean up second_overflow() Impact: cleanup, no functionality changed The 'time_adj' local variable is named in a very confusing way because it almost shadows the 'time_adjust' global variable - which is used in this same function. Rename it to 'delta' - to make them stand apart more clearly. kernel/time/ntp.o: text data bss dec hex filename 2545 114 144 2803 af3 ntp.o.before 2545 114 144 2803 af3 ntp.o.after md5: 1bf0b3be564512279ba7cee299d1d2be ntp.o.before.asm 1bf0b3be564512279ba7cee299d1d2be ntp.o.after.asm Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index a3fe7ef..c74eb7d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -236,7 +236,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) */ void second_overflow(void) { - s64 time_adj; + s64 delta; /* Bump the maxerror field */ time_maxerror += MAXFREQ / NSEC_PER_USEC; @@ -249,10 +249,11 @@ void second_overflow(void) * Compute the phase adjustment for the next second. The offset is * reduced by a fixed factor times the time constant. */ - tick_length = tick_length_base; - time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); - time_offset -= time_adj; - tick_length += time_adj; + tick_length = tick_length_base; + + delta = shift_right(time_offset, SHIFT_PLL + time_constant); + time_offset -= delta; + tick_length += delta; if (!time_adjust) return; -- cgit v1.1 From c40c6f85a7594ad842233885386a0ca4cd40eafe Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 26 Feb 2009 15:40:15 +0800 Subject: cpuacct: add a branch prediction cpuacct_charge() is in fast-path, and checking of !cpuacct_susys.active always returns false after cpuacct has been initialized at system boot. Signed-off-by: Li Zefan Cc: Peter Zijlstra Cc: Paul Menage Cc: Balbir Singh Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5475d56..8e63ffb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9684,7 +9684,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) struct cpuacct *ca; int cpu; - if (!cpuacct_subsys.active) + if (unlikely(!cpuacct_subsys.active)) return; cpu = task_cpu(tsk); -- cgit v1.1 From a2a5ac8650b570bea3cb3614f77739dcd07d6632 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 26 Feb 2009 09:46:14 -0800 Subject: time: ntp: fix bug in ntp_update_offset() & do_adjtimex(), fix The time_status conditional was accidentally placed right after we clear the checked time_status bits, which causes us to take the conditional every time through. This fixes it by moving the conditional to before we clear the time_status bits. Signed-off-by: John Stultz Cc: Clark Williams Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c74eb7d..7fc6437 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -365,8 +365,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) time_state = TIME_OK; time_status = STA_UNSYNC; } - /* only set allowed bits */ - time_status &= STA_RONLY; /* * If we turn on PLL adjustments then reset the @@ -375,6 +373,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) time_reftime = xtime.tv_sec; + /* only set allowed bits */ + time_status &= STA_RONLY; time_status |= txc->status & ~STA_RONLY; switch (time_state) { -- cgit v1.1 From b342501cd31e5546d0c9ca8ceff5ded1832f9e5b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Feb 2009 20:20:29 +0100 Subject: sched: allow architectures to specify sched_clock_stable Allow CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures to still specify that their sched_clock() implementation is reliable. This will be used by x86 to switch on a faster sched_clock_cpu() implementation on certain CPU types. Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index a0b0852..a755d02 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -24,11 +24,11 @@ * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat * consistent between cpus (never more than 2 jiffies difference). */ -#include -#include #include -#include #include +#include +#include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -43,6 +43,10 @@ unsigned long long __attribute__((weak)) sched_clock(void) static __read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +__read_mostly int sched_clock_stable; +#else +static const int sched_clock_stable = 1; +#endif struct sched_clock_data { /* @@ -87,7 +91,7 @@ void sched_clock_init(void) } /* - * min,max except they take wrapping into account + * min, max except they take wrapping into account */ static inline u64 wrap_min(u64 x, u64 y) @@ -116,10 +120,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) if (unlikely(delta < 0)) delta = 0; + if (unlikely(!sched_clock_running)) + return 0ull; + /* * scd->clock = clamp(scd->tick_gtod + delta, - * max(scd->tick_gtod, scd->clock), - * scd->tick_gtod + TICK_NSEC); + * max(scd->tick_gtod, scd->clock), + * scd->tick_gtod + TICK_NSEC); */ clock = scd->tick_gtod + delta; @@ -148,12 +155,13 @@ static void lock_double_clock(struct sched_clock_data *data1, u64 sched_clock_cpu(int cpu) { - struct sched_clock_data *scd = cpu_sdc(cpu); u64 now, clock, this_clock, remote_clock; + struct sched_clock_data *scd; - if (unlikely(!sched_clock_running)) - return 0ull; + if (sched_clock_stable) + return sched_clock(); + scd = cpu_sdc(cpu); WARN_ON_ONCE(!irqs_disabled()); now = sched_clock(); @@ -193,6 +201,8 @@ u64 sched_clock_cpu(int cpu) return clock; } +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); @@ -235,22 +245,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ - -void sched_clock_init(void) -{ - sched_clock_running = 1; -} - -u64 sched_clock_cpu(int cpu) -{ - if (unlikely(!sched_clock_running)) - return 0; - - return sched_clock(); -} - -#endif +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ unsigned long long cpu_clock(int cpu) { -- cgit v1.1 From 8325d9c09dedf45476f4d6261d1b6a72e4a7453f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Feb 2009 21:40:16 +0100 Subject: sched_clock: cleanups - remove superfluous checks in __update_sched_clock() - skip sched_clock_tick() for sched_clock_stable - reinstate the simple !HAVE_UNSTABLE_SCHED_CLOCK code to please the bloatwatch Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index a755d02..390f332 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -44,9 +44,6 @@ static __read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK __read_mostly int sched_clock_stable; -#else -static const int sched_clock_stable = 1; -#endif struct sched_clock_data { /* @@ -115,14 +112,9 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) s64 delta = now - scd->tick_raw; u64 clock, min_clock, max_clock; - WARN_ON_ONCE(!irqs_disabled()); - if (unlikely(delta < 0)) delta = 0; - if (unlikely(!sched_clock_running)) - return 0ull; - /* * scd->clock = clamp(scd->tick_gtod + delta, * max(scd->tick_gtod, scd->clock), @@ -201,18 +193,20 @@ u64 sched_clock_cpu(int cpu) return clock; } -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - void sched_clock_tick(void) { - struct sched_clock_data *scd = this_scd(); + struct sched_clock_data *scd; u64 now, now_gtod; + if (sched_clock_stable) + return; + if (unlikely(!sched_clock_running)) return; WARN_ON_ONCE(!irqs_disabled()); + scd = this_scd(); now_gtod = ktime_to_ns(ktime_get()); now = sched_clock(); @@ -245,6 +239,21 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ + +void sched_clock_init(void) +{ + sched_clock_running = 1; +} + +u64 sched_clock_cpu(int cpu) +{ + if (unlikely(!sched_clock_running)) + return 0; + + return sched_clock(); +} + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ unsigned long long cpu_clock(int cpu) -- cgit v1.1 From 1d1e97562e5e2ac60fb7b25437ba619f95f67fab Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Thu, 26 Feb 2009 18:27:38 -0600 Subject: keys: distinguish per-uid keys in different namespaces per-uid keys were looked by uid only. Use the user namespace to distinguish the same uid in different namespaces. This does not address key_permission. So a task can for instance try to join a keyring owned by the same uid in another namespace. That will be handled by a separate patch. Signed-off-by: Serge E. Hallyn Acked-by: David Howells Signed-off-by: James Morris --- kernel/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 477b666..d8b332c 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -20,7 +20,7 @@ struct user_namespace init_user_ns = { .kref = { - .refcount = ATOMIC_INIT(1), + .refcount = ATOMIC_INIT(2), }, .creator = &root_user, }; -- cgit v1.1 From b67802ea8061393f7bd2d4db934646e76096027c Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Mon, 2 Mar 2009 13:55:26 +0800 Subject: sched: kill unused parameter of pick_next_task() Impact: micro-optimization Parameter "prev" is not used really. Signed-off-by: Wang Chen Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dfae1bf..9fe8e17 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4603,7 +4603,7 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +pick_next_task(struct rq *rq) { const struct sched_class *class; struct task_struct *p; @@ -4678,7 +4678,7 @@ need_resched_nonpreemptible: idle_balance(cpu, rq); prev->sched_class->put_prev_task(rq, prev); - next = pick_next_task(rq, prev); + next = pick_next_task(rq); if (likely(prev != next)) { sched_info_switch(prev, next); @@ -6514,7 +6514,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) if (!rq->nr_running) break; update_rq_clock(rq); - next = pick_next_task(rq, rq->curr); + next = pick_next_task(rq); if (!next) break; next->sched_class->put_prev_task(rq, next); -- cgit v1.1 From 044d408409cc4e1bc75c886e27ca85c270db104c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Mar 2009 16:13:32 +0100 Subject: genirq: assert that irq handlers are indeed running in hardirq context Make sure the genirq layer handlers are indeed running handlers in hardirq context. That is the genirq expectation and doing anything else is broken. Signed-off-by: Peter Zijlstra Cc: Andrew Morton LKML-Reference: <1236006812.5330.632.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3aba8d1..a2ee682 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -328,6 +328,8 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; + WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!"); + if (!(action->flags & IRQF_DISABLED)) local_irq_enable_in_hardirq(); -- cgit v1.1 From 8a0be9ef8225638d26b455788f988c8f84ce9e75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 5 Mar 2009 01:27:02 +0100 Subject: sched: don't rebalance if attached on NULL domain Impact: fix function graph trace hang / drop pointless softirq on UP While debugging a function graph trace hang on an old PII, I saw that it consumed most of its time on the timer interrupt. And the domain rebalancing softirq was the most concerned. The timer interrupt calls trigger_load_balance() which will decide if it is worth to schedule a rebalancing softirq. In case of builtin UP kernel, no problem arises because there is no domain question. In case of builtin SMP kernel running on an SMP box, still no problem, the softirq will be raised each time we reach the next_balance time. In case of builtin SMP kernel running on a UP box (most distros provide default SMP kernels, whatever the box you have), then the CPU is attached to the NULL sched domain. So a kind of unexpected behaviour happen: trigger_load_balance() -> raises the rebalancing softirq later on softirq: run_rebalance_domains() -> rebalance_domains() where the for_each_domain(cpu, sd) is not taken because of the NULL domain we are attached at. Which means rq->next_balance is never updated. So on the next timer tick, we will enter trigger_load_balance() which will always reschedule() the rebalacing softirq: if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); So for each tick, we process this pointless softirq. This patch fixes it by checking if we are attached to the null domain before raising the softirq, another possible fix would be to set the maximal possible JIFFIES value to rq->next_balance if we are attached to the NULL domain. v2: build fix on UP Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <49af242d.1c07d00a.32d5.ffffc019@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dfae1bf..e509dbd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4148,6 +4148,11 @@ static void run_rebalance_domains(struct softirq_action *h) #endif } +static inline int on_null_domain(int cpu) +{ + return !rcu_dereference(cpu_rq(cpu)->sd); +} + /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. * @@ -4205,7 +4210,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) cpumask_test_cpu(cpu, nohz.cpu_mask)) return; #endif - if (time_after_eq(jiffies, rq->next_balance)) + /* Don't need to rebalance while attached to NULL domain */ + if (time_after_eq(jiffies, rq->next_balance) && + likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); } -- cgit v1.1 From edcb463997ed7b2ffa3bac76e3e75957318f2e01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu, module: implement reserved allocation and use it for module percpu variables Impact: add reserved allocation functionality and use it for module percpu variables This patch implements reserved allocation from the first chunk. When setting up the first chunk, arch can ask to set aside certain number of bytes right after the core static area which is available only through a separate reserved allocator. This will be used primarily for module static percpu variables on architectures with limited relocation range to ensure that the module perpcu symbols are inside the relocatable range. If reserved area is requested, the first chunk becomes reserved and isn't available for regular allocation. If the first chunk also includes piggy-back dynamic allocation area, a separate chunk mapping the same region is created to serve dynamic allocation. The first one is called static first chunk and the second dynamic first chunk. Although they share the page map, their different area map initializations guarantee they serve disjoint areas according to their purposes. If arch doesn't setup reserved area, reserved allocation is handled like any other allocation. Signed-off-by: Tejun Heo --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1f0657a..f0e04d6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -381,7 +381,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, align = PAGE_SIZE; } - ptr = __alloc_percpu(size, align); + ptr = __alloc_reserved_percpu(size, align); if (!ptr) printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", size); -- cgit v1.1 From 5ed0cec0ac5f1b3759bdbe4d9df32ee4ff8afb5a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 19:40:20 +0800 Subject: sched: TIF_NEED_RESCHED -> need_reshed() cleanup Impact: cleanup Use test_tsk_need_resched(), set_tsk_need_resched(), need_resched() instead of using TIF_NEED_RESCHED. Signed-off-by: Lai Jiangshan Cc: Peter Zijlstra LKML-Reference: <49B10BA4.9070209@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8b92f40..e0fa739 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1189,10 +1189,10 @@ static void resched_task(struct task_struct *p) assert_spin_locked(&task_rq(p)->lock); - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + if (test_tsk_need_resched(p)) return; - set_tsk_thread_flag(p, TIF_NEED_RESCHED); + set_tsk_need_resched(p); cpu = task_cpu(p); if (cpu == smp_processor_id()) @@ -1248,7 +1248,7 @@ void wake_up_idle_cpu(int cpu) * lockless. The worst case is that the other CPU runs the * idle task through an additional NOOP schedule() */ - set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); + set_tsk_need_resched(rq->idle); /* NEED_RESCHED must be visible before we test polling */ smp_mb(); @@ -4740,7 +4740,7 @@ asmlinkage void __sched preempt_schedule(void) * between schedule and now. */ barrier(); - } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); + } while (need_resched()); } EXPORT_SYMBOL(preempt_schedule); @@ -4769,7 +4769,7 @@ asmlinkage void __sched preempt_schedule_irq(void) * between schedule and now. */ barrier(); - } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); + } while (need_resched()); } #endif /* CONFIG_PREEMPT */ -- cgit v1.1 From 57310a98a354e84279d7c8af2f48805a62372e53 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Mar 2009 13:56:21 +0100 Subject: sched: optimize ttwu vs group scheduling Impact: micro-optimization We can avoid the sched domain walk on try_to_wake_up() when we know there are no groups. Signed-off-by: Peter Zijlstra LKML-Reference: <1236603381.8389.455.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e0fa739..af5cd1b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -331,6 +331,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; */ static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_SMP +static int root_task_group_empty(void) +{ + return list_empty(&root_task_group.children); +} +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) @@ -391,6 +398,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #else +#ifdef CONFIG_SMP +static int root_task_group_empty(void) +{ + return 1; +} +#endif + static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline struct task_group *task_group(struct task_struct *p) { @@ -2318,7 +2332,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) sync = 0; #ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE)) { + if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { struct sched_domain *sd; this_cpu = raw_smp_processor_id(); -- cgit v1.1 From df1c99d416500da8d26a4d78777467c53ee7689e Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 10 Mar 2009 19:08:11 +0100 Subject: sched: add avg_overlap decay Impact: more precise avg_overlap metric - better load-balancing avg_overlap is used to measure the runtime overlap of the waker and wakee. However, when a process changes behaviour, eg a pipe becomes un-congested and we don't need to go to sleep after a wakeup for a while, the avg_overlap value grows stale. When running we use the avg runtime between preemption as a measure for avg_overlap since the amount of runtime can be correlated to cache footprint. The longer we run, the less likely we'll be wanting to be migrated to another CPU. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1236709131.25234.576.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index af5cd1b..2f28351 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4620,6 +4620,28 @@ static inline void schedule_debug(struct task_struct *prev) #endif } +static void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + if (prev->state == TASK_RUNNING) { + u64 runtime = prev->se.sum_exec_runtime; + + runtime -= prev->se.prev_sum_exec_runtime; + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + + /* + * In order to avoid avg_overlap growing stale when we are + * indeed overlapping and hence not getting put to sleep, grow + * the avg_overlap on preemption. + * + * We use the average preemption runtime because that + * correlates to the amount of cache footprint a task can + * build up. + */ + update_avg(&prev->se.avg_overlap, runtime); + } + prev->sched_class->put_prev_task(rq, prev); +} + /* * Pick up the highest-prio task: */ @@ -4698,7 +4720,7 @@ need_resched_nonpreemptible: if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); next = pick_next_task(rq); if (likely(prev != next)) { -- cgit v1.1 From b2d0994b1301fc3a6a89e1889578dac9227840e3 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 00:55:37 -0700 Subject: futex: update futex commentary Impact: cleanup The futex_hash_bucket can be a bit confusing when first looking at the code as it is a shared queue (and futex_q isn't a queue at all, but rather an element on the queue). The mmap_sem is no longer held outside of the futex_handle_fault() routine, yet numerous comments refer to it. The fshared argument is no an integer. I left some of these comments along as they are simply removed in future patches. Some of the commentary refering to futexes by virtual page mappings was not very clear, and completely accurate (as for shared futexes both the page and the offset are used to determine the key). For the purposes of the function description, just referring to "the futex" seems sufficient. With hashed futexes we now access the page after the hash-bucket is locked, and not only after it is enqueued. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: <20090312075537.9856.29954.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 438701a..e6a4d72 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -114,7 +114,9 @@ struct futex_q { }; /* - * Split the global futex_lock into every hash list lock. + * Hash buckets are shared by all the futex_keys that hash to the same + * location. Each key may have multiple futex_q structures, one for each task + * waiting on a futex. */ struct futex_hash_bucket { spinlock_t lock; @@ -189,8 +191,7 @@ static void drop_futex_key_refs(union futex_key *key) /** * get_futex_key - Get parameters which are the keys for a futex. * @uaddr: virtual address of the futex - * @shared: NULL for a PROCESS_PRIVATE futex, - * ¤t->mm->mmap_sem for a PROCESS_SHARED futex + * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. * * Returns a negative error code or 0 @@ -200,9 +201,7 @@ static void drop_futex_key_refs(union futex_key *key) * offset_within_page). For private mappings, it's (uaddr, current->mm). * We can usually work out the index without swapping in the page. * - * fshared is NULL for PROCESS_PRIVATE futexes - * For other futexes, it points to ¤t->mm->mmap_sem and - * caller must have taken the reader lock. but NOT any spinlocks. + * lock_page() might sleep, the caller should not hold a spinlock. */ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { @@ -589,10 +588,9 @@ static void wake_futex(struct futex_q *q) * The waiting task can free the futex_q as soon as this is written, * without taking any locks. This must come last. * - * A memory barrier is required here to prevent the following store - * to lock_ptr from getting ahead of the wakeup. Clearing the lock - * at the end of wake_up_all() does not prevent this store from - * moving. + * A memory barrier is required here to prevent the following store to + * lock_ptr from getting ahead of the wakeup. Clearing the lock at the + * end of wake_up() does not prevent this store from moving. */ smp_wmb(); q->lock_ptr = NULL; @@ -693,8 +691,7 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) } /* - * Wake up all waiters hashed on the physical page that is mapped - * to this virtual address: + * Wake up waiters matching bitset queued on this futex (uaddr). */ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) { @@ -1076,11 +1073,9 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * in the user space variable. This must be atomic as we have * to preserve the owner died bit here. * - * Note: We write the user space value _before_ changing the - * pi_state because we can fault here. Imagine swapped out - * pages or a fork, which was running right before we acquired - * mmap_sem, that marked all the anonymous memory readonly for - * cow. + * Note: We write the user space value _before_ changing the pi_state + * because we can fault here. Imagine swapped out pages or a fork + * that marked all the anonymous memory readonly for cow. * * Modifying pi_state _before_ the user space value would * leave the pi_state in an inconsistent state when we fault @@ -1188,7 +1183,7 @@ retry: hb = queue_lock(&q); /* - * Access the page AFTER the futex is queued. + * Access the page AFTER the hash-bucket is locked. * Order is important: * * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); @@ -1204,7 +1199,7 @@ retry: * a wakeup when *uaddr != val on entry to the syscall. This is * rare, but normal. * - * for shared futexes, we hold the mmap semaphore, so the mapping + * For shared futexes, we hold the mmap semaphore, so the mapping * cannot have changed since we looked it up in get_futex_key. */ ret = get_futex_value_locked(&uval, uaddr); -- cgit v1.1 From de87fcc124a5d4a171aa32707b3265608ebda6e7 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 00:55:46 -0700 Subject: futex: additional (get|put)_futex_key() fixes Impact: fix races futex_requeue and futex_lock_pi still had some bad (get|put)_futex_key() usage. This patch adds the missing put_futex_keys() and corrects a goto in futex_lock_pi() to avoid a double get. Build and boot tested on a 4 way Intel x86_64 workstation. Passes basic pthread_mutex and PI tests out of ltp/testcases/realtime. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: <20090312075545.9856.75152.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e6a4d72..4000454 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -802,8 +802,10 @@ retry: ret = get_user(dummy, uaddr2); if (ret) - return ret; + goto out_put_keys; + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); goto retryfull; } @@ -878,6 +880,9 @@ retry: if (hb1 != hb2) spin_unlock(&hb2->lock); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + ret = get_user(curval, uaddr1); if (!ret) @@ -1453,6 +1458,7 @@ retry_locked: * exit to complete. */ queue_unlock(&q, hb); + put_futex_key(fshared, &q.key); cond_resched(); goto retry; @@ -1595,13 +1601,12 @@ uaddr_faulted: ret = get_user(uval, uaddr); if (!ret) - goto retry; + goto retry_unlocked; - if (to) - destroy_hrtimer_on_stack(&to->timer); - return ret; + goto out_put_key; } + /* * Userspace attempted a TID -> 0 atomic transition, and failed. * This is the in-kernel slowpath: we look up the PI state (if any), @@ -1705,6 +1710,7 @@ pi_faulted: } ret = get_user(uval, uaddr); + put_futex_key(fshared, &key); if (!ret) goto retry; -- cgit v1.1 From 5eb3dc62fc5986e85715041c23dcf3832812be4b Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 00:55:52 -0700 Subject: futex: add double_unlock_hb() Impact: cleanup The futex code uses double_lock_hb() which locks the hb->lock's in pointer value order. There is no parallel unlock routine, and the code unlocks them in name order, ignoring pointer value. This patch adds double_unlock_hb() to refactor the duplicated code segments. Build and boot tested on a 4 way Intel x86_64 workstation. Passes basic pthread_mutex and PI tests out of ltp/testcases/realtime. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: <20090312075552.9856.48021.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 4000454..e149545 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -690,6 +690,19 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) } } +static inline void +double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) +{ + if (hb1 <= hb2) { + spin_unlock(&hb2->lock); + if (hb1 < hb2) + spin_unlock(&hb1->lock); + } else { /* hb1 > hb2 */ + spin_unlock(&hb1->lock); + spin_unlock(&hb2->lock); + } +} + /* * Wake up waiters matching bitset queued on this futex (uaddr). */ @@ -767,9 +780,7 @@ retry: if (unlikely(op_ret < 0)) { u32 dummy; - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); + double_unlock_hb(hb1, hb2); #ifndef CONFIG_MMU /* @@ -833,9 +844,7 @@ retry: ret += op_ret; } - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); + double_unlock_hb(hb1, hb2); out_put_keys: put_futex_key(fshared, &key2); out_put_key1: @@ -876,9 +885,7 @@ retry: ret = get_futex_value_locked(&curval, uaddr1); if (unlikely(ret)) { - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); + double_unlock_hb(hb1, hb2); put_futex_key(fshared, &key2); put_futex_key(fshared, &key1); @@ -925,9 +932,7 @@ retry: } out_unlock: - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); + double_unlock_hb(hb1, hb2); /* drop_futex_key_refs() must be called outside the spinlocks. */ while (--drop_count >= 0) -- cgit v1.1 From 16f4993f4e9860715918efd4eeac928f8de1218b Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 00:55:59 -0700 Subject: futex: use current->time_slack_ns for rt tasks too RT tasks should set their timer slack to 0 on their own. This patch removes the 'if (rt_task()) slack = 0;' block in futex_wait. Build and boot tested on a 4 way Intel x86_64 workstation. Passes basic pthread_mutex and PI tests out of ltp/testcases/realtime. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell Cc: Arjan van de Ven LKML-Reference: <20090312075559.9856.28822.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e149545..6579912 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1253,16 +1253,13 @@ retry: if (!abs_time) schedule(); else { - unsigned long slack; - slack = current->timer_slack_ns; - if (rt_task(current)) - slack = 0; hrtimer_init_on_stack(&t.timer, clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_init_sleeper(&t, current); - hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); + hrtimer_set_expires_range_ns(&t.timer, *abs_time, + current->timer_slack_ns); hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); if (!hrtimer_active(&t.timer)) -- cgit v1.1 From e8f6386c01a5699c115bdad10271a24076364c97 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 00:56:06 -0700 Subject: futex: unlock before returning -EFAULT Impact: rt-mutex failure case fix futex_lock_pi can potentially return -EFAULT with the rt_mutex held. This seems like the wrong thing to do as userspace should assume -EFAULT means the lock was not taken. Even if it could figure this out, we'd be leaving the pi_state->owner in an inconsistent state. This patch unlocks the rt_mutex prior to returning -EFAULT to userspace. Build and boot tested on a 4 way Intel x86_64 workstation. Passes basic pthread_mutex and PI tests out of ltp/testcases/realtime. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: <20090312075606.9856.88729.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6579912..c980a55 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1567,6 +1567,13 @@ retry_locked: } } + /* + * If fixup_pi_state_owner() faulted and was unable to handle the + * fault, unlock it and return the fault to userspace. + */ + if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) + rt_mutex_unlock(&q.pi_state->pi_mutex); + /* Unqueue and drop the lock */ unqueue_me_pi(&q); -- cgit v1.1 From e4dc5b7a36a49eff97050894cf1b3a9a02523717 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 00:56:13 -0700 Subject: futex: clean up fault logic Impact: cleanup Older versions of the futex code held the mmap_sem which had to be dropped in order to call get_user(), so a two-pronged fault handling mechanism was employed to handle faults of the atomic operations. The mmap_sem is no longer held, so get_user() should be adequate. This patch greatly simplifies the logic and improves legibility. Build and boot tested on a 4 way Intel x86_64 workstation. Passes basic pthread_mutex and PI tests out of ltp/testcases/realtime. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: <20090312075612.9856.48612.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 126 +++++++++++++++++---------------------------------------- 1 file changed, 36 insertions(+), 90 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index c980a55..9c97f67 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -298,41 +298,6 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) return ret ? -EFAULT : 0; } -/* - * Fault handling. - */ -static int futex_handle_fault(unsigned long address, int attempt) -{ - struct vm_area_struct * vma; - struct mm_struct *mm = current->mm; - int ret = -EFAULT; - - if (attempt > 2) - return ret; - - down_read(&mm->mmap_sem); - vma = find_vma(mm, address); - if (vma && address >= vma->vm_start && - (vma->vm_flags & VM_WRITE)) { - int fault; - fault = handle_mm_fault(mm, vma, address, 1); - if (unlikely((fault & VM_FAULT_ERROR))) { -#if 0 - /* XXX: let's do this when we verify it is OK */ - if (ret & VM_FAULT_OOM) - ret = -ENOMEM; -#endif - } else { - ret = 0; - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; - } - } - up_read(&mm->mmap_sem); - return ret; -} /* * PI code: @@ -760,9 +725,9 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, struct futex_hash_bucket *hb1, *hb2; struct plist_head *head; struct futex_q *this, *next; - int ret, op_ret, attempt = 0; + int ret, op_ret; -retryfull: +retry: ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -773,9 +738,8 @@ retryfull: hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); -retry: double_lock_hb(hb1, hb2); - +retry_private: op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { u32 dummy; @@ -796,28 +760,16 @@ retry: goto out_put_keys; } - /* - * futex_atomic_op_inuser needs to both read and write - * *(int __user *)uaddr2, but we can't modify it - * non-atomically. Therefore, if get_user below is not - * enough, we need to handle the fault ourselves, while - * still holding the mmap_sem. - */ - if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr2, - attempt); - if (ret) - goto out_put_keys; - goto retry; - } - ret = get_user(dummy, uaddr2); if (ret) goto out_put_keys; + if (!fshared) + goto retry_private; + put_futex_key(fshared, &key2); put_futex_key(fshared, &key1); - goto retryfull; + goto retry; } head = &hb1->chain; @@ -877,6 +829,7 @@ retry: hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); +retry_private: double_lock_hb(hb1, hb2); if (likely(cmpval != NULL)) { @@ -887,15 +840,16 @@ retry: if (unlikely(ret)) { double_unlock_hb(hb1, hb2); - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); - ret = get_user(curval, uaddr1); + if (ret) + goto out_put_keys; - if (!ret) - goto retry; + if (!fshared) + goto retry_private; - goto out_put_keys; + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + goto retry; } if (curval != *cmpval) { ret = -EAGAIN; @@ -1070,7 +1024,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, struct futex_pi_state *pi_state = q->pi_state; struct task_struct *oldowner = pi_state->owner; u32 uval, curval, newval; - int ret, attempt = 0; + int ret; /* Owner died? */ if (!pi_state->owner) @@ -1141,7 +1095,7 @@ retry: handle_fault: spin_unlock(q->lock_ptr); - ret = futex_handle_fault((unsigned long)uaddr, attempt++); + ret = get_user(uval, uaddr); spin_lock(q->lock_ptr); @@ -1190,6 +1144,7 @@ retry: if (unlikely(ret != 0)) goto out; +retry_private: hb = queue_lock(&q); /* @@ -1216,13 +1171,16 @@ retry: if (unlikely(ret)) { queue_unlock(&q, hb); - put_futex_key(fshared, &q.key); ret = get_user(uval, uaddr); + if (ret) + goto out_put_key; - if (!ret) - goto retry; - goto out; + if (!fshared) + goto retry_private; + + put_futex_key(fshared, &q.key); + goto retry; } ret = -EWOULDBLOCK; if (unlikely(uval != val)) { @@ -1356,7 +1314,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, struct futex_hash_bucket *hb; u32 uval, newval, curval; struct futex_q q; - int ret, lock_taken, ownerdied = 0, attempt = 0; + int ret, lock_taken, ownerdied = 0; if (refill_pi_state_cache()) return -ENOMEM; @@ -1376,7 +1334,7 @@ retry: if (unlikely(ret != 0)) goto out; -retry_unlocked: +retry_private: hb = queue_lock(&q); retry_locked: @@ -1601,18 +1559,15 @@ uaddr_faulted: */ queue_unlock(&q, hb); - if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, attempt); - if (ret) - goto out_put_key; - goto retry_unlocked; - } - ret = get_user(uval, uaddr); - if (!ret) - goto retry_unlocked; + if (ret) + goto out_put_key; - goto out_put_key; + if (!fshared) + goto retry_private; + + put_futex_key(fshared, &q.key); + goto retry; } @@ -1628,7 +1583,7 @@ static int futex_unlock_pi(u32 __user *uaddr, int fshared) u32 uval; struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; - int ret, attempt = 0; + int ret; retry: if (get_user(uval, uaddr)) @@ -1644,7 +1599,6 @@ retry: goto out; hb = hash_futex(&key); -retry_unlocked: spin_lock(&hb->lock); /* @@ -1709,17 +1663,9 @@ pi_faulted: * we have to drop the mmap_sem in order to call get_user(). */ spin_unlock(&hb->lock); - - if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, attempt); - if (ret) - goto out; - uval = 0; - goto retry_unlocked; - } + put_futex_key(fshared, &key); ret = get_user(uval, uaddr); - put_futex_key(fshared, &key); if (!ret) goto retry; -- cgit v1.1 From f21cfb258df6dd3ea0b3e56d75c7e994edb81b35 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 12 Mar 2009 21:05:42 +0900 Subject: irq: add remove_irq() for freeing of setup_irq() irqs Impact: add new API This patch adds a remove_irq() function for releasing interrupts requested with setup_irq(). Without this patch we have no way of releasing such interrupts since free_irq() today tries to kfree() the irqaction passed with setup_irq(). Signed-off-by: Magnus Damm LKML-Reference: <20090312120542.2926.56609.sendpatchset@rx1.opensource.se> Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 52ee171..8b069a7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -551,20 +551,14 @@ int setup_irq(unsigned int irq, struct irqaction *act) } /** - * free_irq - free an interrupt + * remove_irq - free an interrupt * @irq: Interrupt line to free * @dev_id: Device identity to free * - * Remove an interrupt handler. The handler is removed and if the - * interrupt line is no longer in use by any driver it is disabled. - * On a shared IRQ the caller must ensure the interrupt is disabled - * on the card it drives before calling this function. The function - * does not return until any executing interrupts for this IRQ - * have completed. - * - * This function must not be called from interrupt context. + * Used to remove interrupts statically setup by the early boot process. */ -void free_irq(unsigned int irq, void *dev_id) + +struct irqaction *remove_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; @@ -573,7 +567,7 @@ void free_irq(unsigned int irq, void *dev_id) WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) - return; + return NULL; spin_lock_irqsave(&desc->lock, flags); @@ -589,7 +583,7 @@ void free_irq(unsigned int irq, void *dev_id) WARN(1, "Trying to free already-free IRQ %d\n", irq); spin_unlock_irqrestore(&desc->lock, flags); - return; + return NULL; } if (action->dev_id == dev_id) @@ -636,7 +630,26 @@ void free_irq(unsigned int irq, void *dev_id) local_irq_restore(flags); } #endif - kfree(action); + return action; +} + +/** + * free_irq - free an interrupt allocated with request_irq + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Remove an interrupt handler. The handler is removed and if the + * interrupt line is no longer in use by any driver it is disabled. + * On a shared IRQ the caller must ensure the interrupt is disabled + * on the card it drives before calling this function. The function + * does not return until any executing interrupts for this IRQ + * have completed. + * + * This function must not be called from interrupt context. + */ +void free_irq(unsigned int irq, void *dev_id) +{ + kfree(remove_irq(irq, dev_id)); } EXPORT_SYMBOL(free_irq); -- cgit v1.1 From cbf94f06824780183e4bba165c7c29d5c7bd9a51 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 12 Mar 2009 21:05:51 +0900 Subject: irq: match remove_irq() args with setup_irq() Modify remove_irq() to match setup_irq(). Signed-off-by: Magnus Damm LKML-Reference: <20090312120551.2926.43942.sendpatchset@rx1.opensource.se> Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8b069a7..fc16570 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -550,15 +550,11 @@ int setup_irq(unsigned int irq, struct irqaction *act) return __setup_irq(irq, desc, act); } -/** - * remove_irq - free an interrupt - * @irq: Interrupt line to free - * @dev_id: Device identity to free - * - * Used to remove interrupts statically setup by the early boot process. + /* + * Internal function to unregister an irqaction - used to free + * regular and special interrupts that are part of the architecture. */ - -struct irqaction *remove_irq(unsigned int irq, void *dev_id) +static struct irqaction *__free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; @@ -634,6 +630,18 @@ struct irqaction *remove_irq(unsigned int irq, void *dev_id) } /** + * remove_irq - free an interrupt + * @irq: Interrupt line to free + * @act: irqaction for the interrupt + * + * Used to remove interrupts statically setup by the early boot process. + */ +void remove_irq(unsigned int irq, struct irqaction *act) +{ + __free_irq(irq, act->dev_id); +} + +/** * free_irq - free an interrupt allocated with request_irq * @irq: Interrupt line to free * @dev_id: Device identity to free @@ -649,7 +657,7 @@ struct irqaction *remove_irq(unsigned int irq, void *dev_id) */ void free_irq(unsigned int irq, void *dev_id) { - kfree(remove_irq(irq, dev_id)); + kfree(__free_irq(irq, dev_id)); } EXPORT_SYMBOL(free_irq); -- cgit v1.1 From eb53b4e8fef10ccccb49a6dbb5e19ca84ba5a305 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 12 Mar 2009 21:05:59 +0900 Subject: irq: export remove_irq() and setup_irq() symbols Export the setup_irq() and remove_irq() symbols. I'd like to export these functions since I have timer code that needs to use setup_irq() early on (too early for request_irq()), and the same code can also be compiled as a module. Signed-off-by: Magnus Damm LKML-Reference: <20090312120559.2926.82371.sendpatchset@rx1.opensource.se> [ changed to _GPL as these are special APIs deep inside the irq layer. ] Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fc16570..e28db0f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -549,6 +549,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) return __setup_irq(irq, desc, act); } +EXPORT_SYMBOL_GPL(setup_irq); /* * Internal function to unregister an irqaction - used to free @@ -640,6 +641,7 @@ void remove_irq(unsigned int irq, struct irqaction *act) { __free_irq(irq, act->dev_id); } +EXPORT_SYMBOL_GPL(remove_irq); /** * free_irq - free an interrupt allocated with request_irq -- cgit v1.1 From f061d35150003b7fd5b133d14d66a74500fdaa60 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 12 Mar 2009 15:11:18 -0700 Subject: futex: remove the pointer math from double_unlock_hb Impact: simplify code I mistakenly included the pointer value ordering in the double_unlock_hb() in my previous patch. It's only necessary in the double_lock_hb() function. This patch removes it. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: <20090312221118.11146.68610.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 9c97f67..2331b73 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -658,14 +658,8 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) static inline void double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { - if (hb1 <= hb2) { - spin_unlock(&hb2->lock); - if (hb1 < hb2) - spin_unlock(&hb1->lock); - } else { /* hb1 > hb2 */ - spin_unlock(&hb1->lock); - spin_unlock(&hb2->lock); - } + spin_unlock(&hb1->lock); + spin_unlock(&hb2->lock); } /* -- cgit v1.1 From 88f502fedba82eff252b6420e8b8328e4ae25c67 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 10:32:07 +0100 Subject: futex: remove the pointer math from double_unlock_hb, fix Impact: fix double unlock crash Thomas Gleixner noticed that the simplified double_unlock_hb() became ... too unsophisticated: in the hb1 == hb2 case it will do a double unlock. Reported-by: Thomas Gleixner Cc: Darren Hart LKML-Reference: <20090312221118.11146.68610.stgit@Aeon> Signed-off-by: Ingo Molnar --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 2331b73..6b50a02 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -659,7 +659,8 @@ static inline void double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { spin_unlock(&hb1->lock); - spin_unlock(&hb2->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); } /* -- cgit v1.1 From c8e2aeef0b8ac9fb8821b8b3734c031579d0b77a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Mar 2009 20:26:23 +0100 Subject: genirq: remove redundant if condition Impact: cleanup The code is only compiled if CONFIG_GENERIC_HARDIRQS=y so another check for this define in the code is redundant. Remove it. Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e28db0f..4600f87 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -15,7 +15,7 @@ #include "internals.h" -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) +#ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; /** -- cgit v1.1 From 4553573277906901f62f73c0432b332c53de5e2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 22 Feb 2009 23:00:32 +0100 Subject: genirq: use kzalloc instead of explicit zero initialization Impact: simplification Signed-off-by: Thomas Gleixner Reviewed-by: Peter Zijlstra --- kernel/irq/manage.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4600f87..8a22039 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -737,15 +737,13 @@ int request_irq(unsigned int irq, irq_handler_t handler, if (!handler) return -EINVAL; - action = kmalloc(sizeof(struct irqaction), GFP_KERNEL); + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = irqflags; - cpus_clear(action->mask); action->name = devname; - action->next = NULL; action->dev_id = dev_id; retval = __setup_irq(irq, desc, action); -- cgit v1.1 From 0e57aa11abb15b70db53d1f95ae70b3c980ac885 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Mar 2009 14:34:05 +0100 Subject: genirq: deprecate __do_IRQ Two years migration time is enough. Remove the compability cruft. Add the deprecated warning in kernel/irq/handle.c because marking __do_IRQ itself is way too noisy. Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a2ee682..6661704 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -349,6 +349,11 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) } #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ + +#ifdef CONFIG_ENABLE_WARN_DEPRECATED +# warning __do_IRQ is deprecated. Please convert to proper flow handlers +#endif + /** * __do_IRQ - original all in one highlevel IRQ handler * @irq: the interrupt number -- cgit v1.1 From 80dd99b368cf6501be88ab517bbbb5bf352b75b8 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 16 Mar 2009 19:58:09 +0000 Subject: sched: fix typos in documentation Fixed typos in function documentation. Signed-off-by: Luis Henriques LKML-Reference: <20090316195809.GA6073@hades.domain.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2f28351..489e7d9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2082,7 +2082,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * it must be off the runqueue _entirely_, and not * preempted! * - * So if it wa still runnable (but just not actively + * So if it was still runnable (but just not actively * running right now), it's preempted, and we should * yield - it could be a while. */ @@ -2574,7 +2574,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) #ifdef CONFIG_PREEMPT_NOTIFIERS /** - * preempt_notifier_register - tell me when current is being being preempted & rescheduled + * preempt_notifier_register - tell me when current is being preempted & rescheduled * @notifier: notifier struct to register */ void preempt_notifier_register(struct preempt_notifier *notifier) -- cgit v1.1 From 708dc5125309cd33c5daaad3026cc4ae6ef39c8b Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 16 Mar 2009 19:59:02 +0000 Subject: sched: small optimisation of can_migrate_task() There were 3 invocations of task_hot() in can_migrate_task(). Replace these 3 invocations by only one invocation, cached in a local variable. Signed-off-by: Luis Henriques LKML-Reference: <20090316195902.GA6197@hades.domain.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 489e7d9..d2dfe4c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3002,6 +3002,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { + int tsk_cache_hot = 0; /* * We do not migrate tasks that are: * 1) running (obviously), or @@ -3025,10 +3026,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - if (!task_hot(p, rq->clock, sd) || - sd->nr_balance_failed > sd->cache_nice_tries) { + tsk_cache_hot = task_hot(p, rq->clock, sd); + if (!tsk_cache_hot || + sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS - if (task_hot(p, rq->clock, sd)) { + if (tsk_cache_hot) { schedstat_inc(sd, lb_hot_gained[idle]); schedstat_inc(p, se.nr_forced_migrations); } @@ -3036,7 +3038,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, return 1; } - if (task_hot(p, rq->clock, sd)) { + if (tsk_cache_hot) { schedstat_inc(p, se.nr_failed_migrations_hot); return 0; } -- cgit v1.1 From 6e2b75740bed35df98b8113300579e13ed2ce848 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 16 Mar 2009 18:13:36 -0400 Subject: module: fix refptr allocation and release order Impact: fix ref-after-free crash on failed module load Fix refptr bug: Change refptr allocation and release order not to access a module data structure pointed by 'mod' after freeing mod->module_core. This bug will cause kernel panic(e.g. failed to find undefined symbols). This bug was reported on systemtap bugzilla. http://sources.redhat.com/bugzilla/show_bug.cgi?id=9927 Signed-off-by: Masami Hiramatsu Cc: Eric Dumazet Signed-off-by: Rusty Russell --- kernel/module.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ba22484..1196f5d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2015,14 +2015,6 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto free_mod; -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), - mod->name); - if (!mod->refptr) { - err = -ENOMEM; - goto free_mod; - } -#endif if (pcpuindex) { /* We have a special allocation for this section. */ percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, @@ -2030,7 +2022,7 @@ static noinline struct module *load_module(void __user *umod, mod->name); if (!percpu) { err = -ENOMEM; - goto free_percpu; + goto free_mod; } sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; mod->percpu = percpu; @@ -2082,6 +2074,14 @@ static noinline struct module *load_module(void __user *umod, /* Module has been moved. */ mod = (void *)sechdrs[modindex].sh_addr; +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), + mod->name); + if (!mod->refptr) { + err = -ENOMEM; + goto free_init; + } +#endif /* Now we've moved module, initialize linked lists, etc. */ module_unload_init(mod); @@ -2288,15 +2288,17 @@ static noinline struct module *load_module(void __user *umod, ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); + free_init: +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + percpu_modfree(mod->refptr); +#endif module_free(mod, mod->module_init); free_core: module_free(mod, mod->module_core); + /* mod will be freed with core. Don't access it beyond this line! */ free_percpu: if (percpu) percpu_modfree(percpu); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - percpu_modfree(mod->refptr); -#endif free_mod: kfree(args); free_hdr: -- cgit v1.1 From af66df5ecf9c9e2d2ff86e8203510c1c4519d64c Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Wed, 18 Mar 2009 00:04:25 +0000 Subject: sched: jiffies not printed per CPU The jiffies value was being printed for each CPU, which does not seem to make sense. Moved jiffies to system section. Signed-off-by: Luis Henriques Acked-by: Peter Zijlstra LKML-Reference: <20090318000425.GA2228@hades.domain.com> Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 2b1260f..4daebff 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -272,7 +272,6 @@ static void print_cpu(struct seq_file *m, int cpu) P(nr_switches); P(nr_load_updates); P(nr_uninterruptible); - SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); PN(next_balance); P(curr->pid); PN(clock); @@ -325,6 +324,7 @@ static int sched_debug_show(struct seq_file *m, void *v) SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + P(jiffies); PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); -- cgit v1.1 From 53da1d9456fe7f87a920a78fdbdcf1225d197cb7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 23 Mar 2009 16:07:24 +0100 Subject: fix ptrace slowness This patch fixes bug #12208: Bug-Entry : http://bugzilla.kernel.org/show_bug.cgi?id=12208 Subject : uml is very slow on 2.6.28 host This turned out to be not a scheduler regression, but an already existing problem in ptrace being triggered by subtle scheduler changes. The problem is this: - task A is ptracing task B - task B stops on a trace event - task A is woken up and preempts task B - task A calls ptrace on task B, which does ptrace_check_attach() - this calls wait_task_inactive(), which sees that task B is still on the runq - task A goes to sleep for a jiffy - ... Since UML does lots of the above sequences, those jiffies quickly add up to make it slow as hell. This patch solves this by not rescheduling in read_unlock() after ptrace_stop() has woken up the tracer. Thanks to Oleg Nesterov and Ingo Molnar for the feedback. Signed-off-by: Miklos Szeredi CC: stable@kernel.org Signed-off-by: Linus Torvalds --- kernel/signal.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2a74fe8..1c88144 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1575,7 +1575,15 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) read_lock(&tasklist_lock); if (may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); + /* + * Don't want to allow preemption here, because + * sys_ptrace() needs this task to be inactive. + * + * XXX: implement read_unlock_no_resched(). + */ + preempt_disable(); read_unlock(&tasklist_lock); + preempt_enable_no_resched(); schedule(); } else { /* -- cgit v1.1 From 37bebc70d7ad4144c571d74500db3bb26ec0c0eb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 23 Mar 2009 20:34:11 +0100 Subject: posix timers: fix RLIMIT_CPU && fork() See http://bugzilla.kernel.org/show_bug.cgi?id=12911 copy_signal() copies signal->rlim, but RLIMIT_CPU is "lost". Because posix_cpu_timers_init_group() sets cputime_expires.prof_exp = 0 and thus fastpath_timer_check() returns false unless we have other cpu timers. This is the minimal fix for 2.6.29 (tested) and 2.6.28. The patch is not optimal, we need further cleanups here. With this patch update_rlimit_cpu() is not really needed, but I don't think it should be removed. The proper fix (I think) is: - set_process_cpu_timer() should just start the cputimer->running logic (it does), no need to change cputime_expires.xxx_exp - posix_cpu_timers_init_group() should set ->running when needed - fastpath_timer_check() can check ->running instead of task_cputime_zero(signal->cputime_expires) Reported-by: Peter Lojkin Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Roland McGrath Cc: [for 2.6.29.x] LKML-Reference: <20090323193411.GA17514@redhat.com> Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e976e50..8e5d9a6 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1370,7 +1370,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; } - return 0; + + return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; } /* -- cgit v1.1 From 67aa0f767af488a7f1e41cccb4f7a4893f24a1ab Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 24 Mar 2009 22:10:02 +0000 Subject: sched: remove unused fields from struct rq Impact: cleanup, new schedstat ABI Since they are used on in statistics and are always set to zero, the following fields from struct rq have been removed: yld_exp_empty, yld_act_empty and yld_both_empty. Both Sched Debug and SCHEDSTAT_VERSION versions has also been incremented since ABIs have been changed. The schedtop tool has been updated to properly handle new version of schedstat: http://rt.wiki.kernel.org/index.php/Schedtop_utility Signed-off-by: Luis Henriques Acked-by: Gregory Haskins Acked-by: Peter Zijlstra LKML-Reference: <20090324221002.GA10061@hades.domain.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- kernel/sched_debug.c | 5 +---- kernel/sched_stats.h | 7 +++---- 3 files changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d2dfe4c..7b389c7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -638,9 +638,6 @@ struct rq { /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ - unsigned int yld_exp_empty; - unsigned int yld_act_empty; - unsigned int yld_both_empty; unsigned int yld_count; /* schedule() stats */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4daebff..467ca72 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -286,9 +286,6 @@ static void print_cpu(struct seq_file *m, int cpu) #ifdef CONFIG_SCHEDSTATS #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); - P(yld_exp_empty); - P(yld_act_empty); - P(yld_both_empty); P(yld_count); P(sched_switch); @@ -313,7 +310,7 @@ static int sched_debug_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index a8f93dd..32d2bd4 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -4,7 +4,7 @@ * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 14 +#define SCHEDSTAT_VERSION 15 static int show_schedstat(struct seq_file *seq, void *v) { @@ -26,9 +26,8 @@ static int show_schedstat(struct seq_file *seq, void *v) /* runqueue-specific stats */ seq_printf(seq, - "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu", - cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, + "cpu%d %u %u %u %u %u %u %llu %llu %lu", + cpu, rq->yld_count, rq->sched_switch, rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, rq->rq_cpu_time, -- cgit v1.1 From e9d376f0fa66bd630fe27403669c6ae6c22a868f Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 5 Feb 2009 11:51:38 -0500 Subject: dynamic debug: combine dprintk and dynamic printk This patch combines Greg Bank's dprintk() work with the existing dynamic printk patchset, we are now calling it 'dynamic debug'. The new feature of this patchset is a richer /debugfs control file interface, (an example output from my system is at the bottom), which allows fined grained control over the the debug output. The output can be controlled by function, file, module, format string, and line number. for example, enabled all debug messages in module 'nf_conntrack': echo -n 'module nf_conntrack +p' > /mnt/debugfs/dynamic_debug/control to disable them: echo -n 'module nf_conntrack -p' > /mnt/debugfs/dynamic_debug/control A further explanation can be found in the documentation patch. Signed-off-by: Greg Banks Signed-off-by: Jason Baron Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1196f5d..7767223 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -822,7 +822,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); - unregister_dynamic_debug_module(mod->name); + ddebug_remove_module(mod->name); free_module(mod); out: @@ -1827,19 +1827,13 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ -static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num) +static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) { -#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG - unsigned int i; - - for (i = 0; i < num; i++) { - register_dynamic_debug_module(debug[i].modname, - debug[i].type, - debug[i].logical_modname, - debug[i].flag_names, - debug[i].hash, debug[i].hash2); - } -#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */ +#ifdef CONFIG_DYNAMIC_DEBUG + if (ddebug_add_module(debug, num, debug->modname)) + printk(KERN_ERR "dynamic debug error adding module: %s\n", + debug->modname); +#endif } static void *module_alloc_update_bounds(unsigned long size) @@ -2213,12 +2207,13 @@ static noinline struct module *load_module(void __user *umod, add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); if (!mod->taints) { - struct mod_debug *debug; + struct _ddebug *debug; unsigned int num_debug; debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); - dynamic_printk_setup(debug, num_debug); + if (debug) + dynamic_debug_setup(debug, num_debug); } /* sechdrs[0].sh_size is always zero */ -- cgit v1.1 From 67bb6c036d1fc3d332c8527a36a546e3e72e822c Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:35 +0530 Subject: sched: Simple helper functions for find_busiest_group() Impact: cleanup Currently the load idx calculation code is in find_busiest_group(). Move that to a static inline helper function. Similary, to find the first cpu of a sched_group we use cpumask_first(sched_group_cpus(group)) Use a helper to that. It improves readability in some cases. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091335.13992.55424.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 55 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7b389c7..6aec1e7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3189,6 +3189,43 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } +/********** Helpers for find_busiest_group ************************/ + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + +/** + * get_sd_load_idx - Obtain the load index for a given sched domain. + * @sd: The sched_domain whose load_idx is to be obtained. + * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + */ +static inline int get_sd_load_idx(struct sched_domain *sd, + enum cpu_idle_type idle) +{ + int load_idx; + + switch (idle) { + case CPU_NOT_IDLE: + load_idx = sd->busy_idx; + break; + + case CPU_NEWLY_IDLE: + load_idx = sd->newidle_idx; + break; + default: + load_idx = sd->idle_idx; + break; + } + + return load_idx; +} +/******* find_busiest_group() helpers end here *********************/ /* * find_busiest_group finds and returns the busiest CPU group within the @@ -3217,12 +3254,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, busiest_load_per_task = busiest_nr_running = 0; this_load_per_task = this_nr_running = 0; - if (idle == CPU_NOT_IDLE) - load_idx = sd->busy_idx; - else if (idle == CPU_NEWLY_IDLE) - load_idx = sd->newidle_idx; - else - load_idx = sd->idle_idx; + load_idx = get_sd_load_idx(sd, idle); do { unsigned long load, group_capacity, max_cpu_load, min_cpu_load; @@ -3238,7 +3270,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sched_group_cpus(group)); if (local_group) - balance_cpu = cpumask_first(sched_group_cpus(group)); + balance_cpu = group_first_cpu(group); /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -3359,8 +3391,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - cpumask_first(sched_group_cpus(group)) > - cpumask_first(sched_group_cpus(group_min)))) { + group_first_cpu(group) > group_first_cpu(group_min))) { group_min = group; min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / @@ -3375,8 +3406,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - cpumask_first(sched_group_cpus(group)) < - cpumask_first(sched_group_cpus(group_leader)))) { + group_first_cpu(group) < + group_first_cpu(group_leader))) { group_leader = group; leader_nr_running = sum_nr_running; } @@ -3504,7 +3535,7 @@ out_balanced: *imbalance = min_load_per_task; if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - cpumask_first(sched_group_cpus(group_leader)); + group_first_cpu(group_leader); } return group_min; } -- cgit v1.1 From 6dfdb0629019f307ab18864b1fd3e5dbb02f383c Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:40 +0530 Subject: sched: Fix indentations in find_busiest_group() using gotos Impact: cleanup Some indentations in find_busiest_group() can minimized by using early exits with the help of gotos. This improves readability in a couple of cases. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091340.13992.45062.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6aec1e7..f87adbe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3403,14 +3403,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * capacity but still has some space to pick up some load * from other group and save more power */ - if (sum_nr_running <= group_capacity - 1) { - if (sum_nr_running > leader_nr_running || - (sum_nr_running == leader_nr_running && - group_first_cpu(group) < - group_first_cpu(group_leader))) { - group_leader = group; - leader_nr_running = sum_nr_running; - } + if (sum_nr_running > group_capacity - 1) + goto group_next; + + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + group_first_cpu(group) < group_first_cpu(group_leader))) { + group_leader = group; + leader_nr_running = sum_nr_running; } group_next: #endif @@ -3531,14 +3531,16 @@ out_balanced: if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) goto ret; - if (this == group_leader && group_leader != group_min) { - *imbalance = min_load_per_task; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(group_leader); - } - return group_min; + if (this != group_leader || group_leader == group_min) + goto ret; + + *imbalance = min_load_per_task; + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + group_first_cpu(group_leader); } + return group_min; + #endif ret: *imbalance = 0; -- cgit v1.1 From 381be78fdc829a22f6327a0ed09f54b6270a976d Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:46 +0530 Subject: sched: Define structure to store the sched_group statistics for fbg() Impact: cleanup Currently a whole bunch of variables are used to store the various statistics pertaining to the groups we iterate over in find_busiest_group(). Group them together in a single data structure and add appropriate comments. This will be useful later on when we create helper functions to calculate the sched_group statistics. Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091345.13992.20099.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 79 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f87adbe..109db12 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3192,6 +3192,18 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, /********** Helpers for find_busiest_group ************************/ /** + * sg_lb_stats - stats of a sched_group required for load_balancing + */ +struct sg_lb_stats { + unsigned long avg_load; /*Avg load across the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long sum_nr_running; /* Nr tasks running in the group */ + unsigned long sum_weighted_load; /* Weighted load of group's tasks */ + unsigned long group_capacity; + int group_imb; /* Is there an imbalance in the group ? */ +}; + +/** * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. * @group: The group whose first cpu is to be returned. */ @@ -3257,23 +3269,22 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, load_idx = get_sd_load_idx(sd, idle); do { - unsigned long load, group_capacity, max_cpu_load, min_cpu_load; + struct sg_lb_stats sgs; + unsigned long load, max_cpu_load, min_cpu_load; int local_group; int i; - int __group_imb = 0; unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long sum_nr_running, sum_weighted_load; unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); + memset(&sgs, 0, sizeof(sgs)); if (local_group) balance_cpu = group_first_cpu(group); /* Tally up the load of all CPUs in the group */ - sum_weighted_load = sum_nr_running = avg_load = 0; sum_avg_load_per_task = avg_load_per_task = 0; max_cpu_load = 0; @@ -3301,9 +3312,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, min_cpu_load = load; } - avg_load += load; - sum_nr_running += rq->nr_running; - sum_weighted_load += weighted_cpuload(i); + sgs.group_load += load; + sgs.sum_nr_running += rq->nr_running; + sgs.sum_weighted_load += weighted_cpuload(i); sum_avg_load_per_task += cpu_avg_load_per_task(i); } @@ -3320,12 +3331,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, goto ret; } - total_load += avg_load; + total_load += sgs.group_load; total_pwr += group->__cpu_power; /* Adjust by relative CPU power of the group */ - avg_load = sg_div_cpu_power(group, - avg_load * SCHED_LOAD_SCALE); + sgs.avg_load = sg_div_cpu_power(group, + sgs.group_load * SCHED_LOAD_SCALE); /* @@ -3341,22 +3352,23 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sum_avg_load_per_task * SCHED_LOAD_SCALE); if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) - __group_imb = 1; + sgs.group_imb = 1; - group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; + sgs.group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; if (local_group) { - this_load = avg_load; + this_load = sgs.avg_load; this = group; - this_nr_running = sum_nr_running; - this_load_per_task = sum_weighted_load; - } else if (avg_load > max_load && - (sum_nr_running > group_capacity || __group_imb)) { - max_load = avg_load; + this_nr_running = sgs.sum_nr_running; + this_load_per_task = sgs.sum_weighted_load; + } else if (sgs.avg_load > max_load && + (sgs.sum_nr_running > sgs.group_capacity || + sgs.group_imb)) { + max_load = sgs.avg_load; busiest = group; - busiest_nr_running = sum_nr_running; - busiest_load_per_task = sum_weighted_load; - group_imb = __group_imb; + busiest_nr_running = sgs.sum_nr_running; + busiest_load_per_task = sgs.sum_weighted_load; + group_imb = sgs.group_imb; } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -3372,7 +3384,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * If the local group is idle or completely loaded * no need to do power savings balance at this domain */ - if (local_group && (this_nr_running >= group_capacity || + if (local_group && (this_nr_running >= sgs.group_capacity || !this_nr_running)) power_savings_balance = 0; @@ -3380,8 +3392,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * If a group is already running at full capacity or idle, * don't include that group in power savings calculations */ - if (!power_savings_balance || sum_nr_running >= group_capacity - || !sum_nr_running) + if (!power_savings_balance || + sgs.sum_nr_running >= sgs.group_capacity || + !sgs.sum_nr_running) goto group_next; /* @@ -3389,13 +3402,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * This is the group from where we need to pick up the load * for saving power */ - if ((sum_nr_running < min_nr_running) || - (sum_nr_running == min_nr_running && + if ((sgs.sum_nr_running < min_nr_running) || + (sgs.sum_nr_running == min_nr_running && group_first_cpu(group) > group_first_cpu(group_min))) { group_min = group; - min_nr_running = sum_nr_running; - min_load_per_task = sum_weighted_load / - sum_nr_running; + min_nr_running = sgs.sum_nr_running; + min_load_per_task = sgs.sum_weighted_load / + sgs.sum_nr_running; } /* @@ -3403,14 +3416,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * capacity but still has some space to pick up some load * from other group and save more power */ - if (sum_nr_running > group_capacity - 1) + if (sgs.sum_nr_running > sgs.group_capacity - 1) goto group_next; - if (sum_nr_running > leader_nr_running || - (sum_nr_running == leader_nr_running && + if (sgs.sum_nr_running > leader_nr_running || + (sgs.sum_nr_running == leader_nr_running && group_first_cpu(group) < group_first_cpu(group_leader))) { group_leader = group; - leader_nr_running = sum_nr_running; + leader_nr_running = sgs.sum_nr_running; } group_next: #endif -- cgit v1.1 From 1f8c553d0f11d85f7993fe21015695d266771c00 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:51 +0530 Subject: sched: Create a helper function to calculate sched_group stats for fbg() Impact: cleanup Create a helper function named update_sg_lb_stats() which can be invoked to calculate the individual group's statistics in find_busiest_group(). This reduces the lenght of find_busiest_group() considerably. Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Aked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091351.13992.43461.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 175 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 100 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 109db12..1893d55 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3237,6 +3237,103 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } + + +/** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @group: sched_group whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @load_idx: Load index of sched_domain of this_cpu for load calc. + * @sd_idle: Idle status of the sched_domain containing group. + * @local_group: Does group contain this_cpu. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sgs: variable to hold the statistics for this group. + */ +static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, + enum cpu_idle_type idle, int load_idx, int *sd_idle, + int local_group, const struct cpumask *cpus, + int *balance, struct sg_lb_stats *sgs) +{ + unsigned long load, max_cpu_load, min_cpu_load; + int i; + unsigned int balance_cpu = -1, first_idle_cpu = 0; + unsigned long sum_avg_load_per_task; + unsigned long avg_load_per_task; + + if (local_group) + balance_cpu = group_first_cpu(group); + + /* Tally up the load of all CPUs in the group */ + sum_avg_load_per_task = avg_load_per_task = 0; + max_cpu_load = 0; + min_cpu_load = ~0UL; + + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); + + if (*sd_idle && rq->nr_running) + *sd_idle = 0; + + /* Bias balancing toward cpus of our domain */ + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu) { + first_idle_cpu = 1; + balance_cpu = i; + } + + load = target_load(i, load_idx); + } else { + load = source_load(i, load_idx); + if (load > max_cpu_load) + max_cpu_load = load; + if (min_cpu_load > load) + min_cpu_load = load; + } + + sgs->group_load += load; + sgs->sum_nr_running += rq->nr_running; + sgs->sum_weighted_load += weighted_cpuload(i); + + sum_avg_load_per_task += cpu_avg_load_per_task(i); + } + + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above + * domains. In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. + */ + if (idle != CPU_NEWLY_IDLE && local_group && + balance_cpu != this_cpu && balance) { + *balance = 0; + return; + } + + /* Adjust by relative CPU power of the group */ + sgs->avg_load = sg_div_cpu_power(group, + sgs->group_load * SCHED_LOAD_SCALE); + + + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of two tasks. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + avg_load_per_task = sg_div_cpu_power(group, + sum_avg_load_per_task * SCHED_LOAD_SCALE); + + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) + sgs->group_imb = 1; + + sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; + +} /******* find_busiest_group() helpers end here *********************/ /* @@ -3270,92 +3367,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, do { struct sg_lb_stats sgs; - unsigned long load, max_cpu_load, min_cpu_load; int local_group; - int i; - unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long sum_avg_load_per_task; - unsigned long avg_load_per_task; local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); memset(&sgs, 0, sizeof(sgs)); + update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, + local_group, cpus, balance, &sgs); - if (local_group) - balance_cpu = group_first_cpu(group); - - /* Tally up the load of all CPUs in the group */ - sum_avg_load_per_task = avg_load_per_task = 0; - - max_cpu_load = 0; - min_cpu_load = ~0UL; - - for_each_cpu_and(i, sched_group_cpus(group), cpus) { - struct rq *rq = cpu_rq(i); - - if (*sd_idle && rq->nr_running) - *sd_idle = 0; - - /* Bias balancing toward cpus of our domain */ - if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { - first_idle_cpu = 1; - balance_cpu = i; - } - - load = target_load(i, load_idx); - } else { - load = source_load(i, load_idx); - if (load > max_cpu_load) - max_cpu_load = load; - if (min_cpu_load > load) - min_cpu_load = load; - } - - sgs.group_load += load; - sgs.sum_nr_running += rq->nr_running; - sgs.sum_weighted_load += weighted_cpuload(i); - - sum_avg_load_per_task += cpu_avg_load_per_task(i); - } - - /* - * First idle cpu or the first cpu(busiest) in this sched group - * is eligible for doing load balancing at this and above - * domains. In the newly idle case, we will allow all the cpu's - * to do the newly idle load balance. - */ - if (idle != CPU_NEWLY_IDLE && local_group && - balance_cpu != this_cpu && balance) { - *balance = 0; + if (balance && !(*balance)) goto ret; - } total_load += sgs.group_load; total_pwr += group->__cpu_power; - /* Adjust by relative CPU power of the group */ - sgs.avg_load = sg_div_cpu_power(group, - sgs.group_load * SCHED_LOAD_SCALE); - - - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of two tasks. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? - */ - avg_load_per_task = sg_div_cpu_power(group, - sum_avg_load_per_task * SCHED_LOAD_SCALE); - - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) - sgs.group_imb = 1; - - sgs.group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; - if (local_group) { this_load = sgs.avg_load; this = group; -- cgit v1.1 From 222d656dea57e4e084fbd1e9383e6fed2ca9fa61 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:56 +0530 Subject: sched: Define structure to store the sched_domain statistics for fbg() Impact: cleanup Currently we use a lot of local variables in find_busiest_group() to capture the various statistics related to the sched_domain. Group them together into a single data structure. This will help us to offload the job of updating the sched_domain statistics to a helper function. Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091356.13992.25970.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 207 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 121 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1893d55..8198dbe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3190,6 +3190,37 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } /********** Helpers for find_busiest_group ************************/ +/** + * sd_lb_stats - Structure to store the statistics of a sched_domain + * during load balancing. + */ +struct sd_lb_stats { + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *this; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + + /** Statistics of this group */ + unsigned long this_load; + unsigned long this_load_per_task; + unsigned long this_nr_running; + + /* Statistics of the busiest group */ + unsigned long max_load; + unsigned long busiest_load_per_task; + unsigned long busiest_nr_running; + + int group_imb; /* Is there imbalance in this sd */ +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + int power_savings_balance; /* Is powersave balance needed for this sd */ + struct sched_group *group_min; /* Least loaded group in sd */ + struct sched_group *group_leader; /* Group which relieves group_min */ + unsigned long min_load_per_task; /* load_per_task in group_min */ + unsigned long leader_nr_running; /* Nr running of group_leader */ + unsigned long min_nr_running; /* Nr running of group_min */ +#endif +}; /** * sg_lb_stats - stats of a sched_group required for load_balancing @@ -3346,23 +3377,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, int *sd_idle, const struct cpumask *cpus, int *balance) { - struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; - unsigned long max_load, avg_load, total_load, this_load, total_pwr; + struct sd_lb_stats sds; + struct sched_group *group = sd->groups; unsigned long max_pull; - unsigned long busiest_load_per_task, busiest_nr_running; - unsigned long this_load_per_task, this_nr_running; - int load_idx, group_imb = 0; + int load_idx; + + memset(&sds, 0, sizeof(sds)); #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - int power_savings_balance = 1; - unsigned long leader_nr_running = 0, min_load_per_task = 0; - unsigned long min_nr_running = ULONG_MAX; - struct sched_group *group_min = NULL, *group_leader = NULL; + sds.power_savings_balance = 1; + sds.min_nr_running = ULONG_MAX; #endif - - max_load = this_load = total_load = total_pwr = 0; - busiest_load_per_task = busiest_nr_running = 0; - this_load_per_task = this_nr_running = 0; - load_idx = get_sd_load_idx(sd, idle); do { @@ -3378,22 +3402,22 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (balance && !(*balance)) goto ret; - total_load += sgs.group_load; - total_pwr += group->__cpu_power; + sds.total_load += sgs.group_load; + sds.total_pwr += group->__cpu_power; if (local_group) { - this_load = sgs.avg_load; - this = group; - this_nr_running = sgs.sum_nr_running; - this_load_per_task = sgs.sum_weighted_load; - } else if (sgs.avg_load > max_load && + sds.this_load = sgs.avg_load; + sds.this = group; + sds.this_nr_running = sgs.sum_nr_running; + sds.this_load_per_task = sgs.sum_weighted_load; + } else if (sgs.avg_load > sds.max_load && (sgs.sum_nr_running > sgs.group_capacity || sgs.group_imb)) { - max_load = sgs.avg_load; - busiest = group; - busiest_nr_running = sgs.sum_nr_running; - busiest_load_per_task = sgs.sum_weighted_load; - group_imb = sgs.group_imb; + sds.max_load = sgs.avg_load; + sds.busiest = group; + sds.busiest_nr_running = sgs.sum_nr_running; + sds.busiest_load_per_task = sgs.sum_weighted_load; + sds.group_imb = sgs.group_imb; } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -3409,15 +3433,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * If the local group is idle or completely loaded * no need to do power savings balance at this domain */ - if (local_group && (this_nr_running >= sgs.group_capacity || - !this_nr_running)) - power_savings_balance = 0; + if (local_group && + (sds.this_nr_running >= sgs.group_capacity || + !sds.this_nr_running)) + sds.power_savings_balance = 0; /* * If a group is already running at full capacity or idle, * don't include that group in power savings calculations */ - if (!power_savings_balance || + if (!sds.power_savings_balance || sgs.sum_nr_running >= sgs.group_capacity || !sgs.sum_nr_running) goto group_next; @@ -3427,12 +3452,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * This is the group from where we need to pick up the load * for saving power */ - if ((sgs.sum_nr_running < min_nr_running) || - (sgs.sum_nr_running == min_nr_running && - group_first_cpu(group) > group_first_cpu(group_min))) { - group_min = group; - min_nr_running = sgs.sum_nr_running; - min_load_per_task = sgs.sum_weighted_load / + if ((sgs.sum_nr_running < sds.min_nr_running) || + (sgs.sum_nr_running == sds.min_nr_running && + group_first_cpu(group) > + group_first_cpu(sds.group_min))) { + sds.group_min = group; + sds.min_nr_running = sgs.sum_nr_running; + sds.min_load_per_task = sgs.sum_weighted_load / sgs.sum_nr_running; } @@ -3444,29 +3470,32 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sgs.sum_nr_running > sgs.group_capacity - 1) goto group_next; - if (sgs.sum_nr_running > leader_nr_running || - (sgs.sum_nr_running == leader_nr_running && - group_first_cpu(group) < group_first_cpu(group_leader))) { - group_leader = group; - leader_nr_running = sgs.sum_nr_running; + if (sgs.sum_nr_running > sds.leader_nr_running || + (sgs.sum_nr_running == sds.leader_nr_running && + group_first_cpu(group) < + group_first_cpu(sds.group_leader))) { + sds.group_leader = group; + sds.leader_nr_running = sgs.sum_nr_running; } group_next: #endif group = group->next; } while (group != sd->groups); - if (!busiest || this_load >= max_load || busiest_nr_running == 0) + if (!sds.busiest || sds.this_load >= sds.max_load + || sds.busiest_nr_running == 0) goto out_balanced; - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - if (this_load >= avg_load || - 100*max_load <= sd->imbalance_pct*this_load) + if (sds.this_load >= sds.avg_load || + 100*sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; - busiest_load_per_task /= busiest_nr_running; - if (group_imb) - busiest_load_per_task = min(busiest_load_per_task, avg_load); + sds.busiest_load_per_task /= sds.busiest_nr_running; + if (sds.group_imb) + sds.busiest_load_per_task = + min(sds.busiest_load_per_task, sds.avg_load); /* * We're trying to get all the cpus to the average_load, so we don't @@ -3479,7 +3508,7 @@ group_next: * by pulling tasks to us. Be careful of negative numbers as they'll * appear as very large values with unsigned longs. */ - if (max_load <= busiest_load_per_task) + if (sds.max_load <= sds.busiest_load_per_task) goto out_balanced; /* @@ -3487,17 +3516,18 @@ group_next: * max load less than avg load(as we skip the groups at or below * its cpu_power, while calculating max_load..) */ - if (max_load < avg_load) { + if (sds.max_load < sds.avg_load) { *imbalance = 0; goto small_imbalance; } /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); + max_pull = min(sds.max_load - sds.avg_load, + sds.max_load - sds.busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * busiest->__cpu_power, - (avg_load - this_load) * this->__cpu_power) + *imbalance = min(max_pull * sds.busiest->__cpu_power, + (sds.avg_load - sds.this_load) * sds.this->__cpu_power) / SCHED_LOAD_SCALE; /* @@ -3506,24 +3536,27 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance < busiest_load_per_task) { + if (*imbalance < sds.busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; small_imbalance: pwr_move = pwr_now = 0; imbn = 2; - if (this_nr_running) { - this_load_per_task /= this_nr_running; - if (busiest_load_per_task > this_load_per_task) + if (sds.this_nr_running) { + sds.this_load_per_task /= sds.this_nr_running; + if (sds.busiest_load_per_task > + sds.this_load_per_task) imbn = 1; } else - this_load_per_task = cpu_avg_load_per_task(this_cpu); - - if (max_load - this_load + busiest_load_per_task >= - busiest_load_per_task * imbn) { - *imbalance = busiest_load_per_task; - return busiest; + sds.this_load_per_task = + cpu_avg_load_per_task(this_cpu); + + if (sds.max_load - sds.this_load + + sds.busiest_load_per_task >= + sds.busiest_load_per_task * imbn) { + *imbalance = sds.busiest_load_per_task; + return sds.busiest; } /* @@ -3532,52 +3565,54 @@ small_imbalance: * moving them. */ - pwr_now += busiest->__cpu_power * - min(busiest_load_per_task, max_load); - pwr_now += this->__cpu_power * - min(this_load_per_task, this_load); + pwr_now += sds.busiest->__cpu_power * + min(sds.busiest_load_per_task, sds.max_load); + pwr_now += sds.this->__cpu_power * + min(sds.this_load_per_task, sds.this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = sg_div_cpu_power(busiest, - busiest_load_per_task * SCHED_LOAD_SCALE); - if (max_load > tmp) - pwr_move += busiest->__cpu_power * - min(busiest_load_per_task, max_load - tmp); + tmp = sg_div_cpu_power(sds.busiest, + sds.busiest_load_per_task * SCHED_LOAD_SCALE); + if (sds.max_load > tmp) + pwr_move += sds.busiest->__cpu_power * + min(sds.busiest_load_per_task, + sds.max_load - tmp); /* Amount of load we'd add */ - if (max_load * busiest->__cpu_power < - busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = sg_div_cpu_power(this, - max_load * busiest->__cpu_power); + if (sds.max_load * sds.busiest->__cpu_power < + sds.busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = sg_div_cpu_power(sds.this, + sds.max_load * sds.busiest->__cpu_power); else - tmp = sg_div_cpu_power(this, - busiest_load_per_task * SCHED_LOAD_SCALE); - pwr_move += this->__cpu_power * - min(this_load_per_task, this_load + tmp); + tmp = sg_div_cpu_power(sds.this, + sds.busiest_load_per_task * SCHED_LOAD_SCALE); + pwr_move += sds.this->__cpu_power * + min(sds.this_load_per_task, + sds.this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ if (pwr_move > pwr_now) - *imbalance = busiest_load_per_task; + *imbalance = sds.busiest_load_per_task; } - return busiest; + return sds.busiest; out_balanced: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) goto ret; - if (this != group_leader || group_leader == group_min) + if (sds.this != sds.group_leader || sds.group_leader == sds.group_min) goto ret; - *imbalance = min_load_per_task; + *imbalance = sds.min_load_per_task; if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(group_leader); + group_first_cpu(sds.group_leader); } - return group_min; + return sds.group_min; #endif ret: -- cgit v1.1 From 37abe198b1246ddd206319c43502a687db62d347 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:01 +0530 Subject: sched: Create a helper function to calculate sched_domain stats for fbg() Impact: cleanup Create a helper function named update_sd_lb_stats() to update the various sched_domain related statistics in find_busiest_group(). With this we would have moved all the statistics computation out of find_busiest_group(). Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091401.13992.88737.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 117 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8198dbe..ec715f9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3365,32 +3365,33 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; } -/******* find_busiest_group() helpers end here *********************/ -/* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the amount of weighted load which - * should be moved to restore balance via the imbalance parameter. +/** + * update_sd_lb_stats - Update sched_group's statistics for load balancing. + * @sd: sched_domain whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @sd_idle: Idle status of the sched_domain containing group. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sds: variable to hold the statistics for this sched_domain. */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const struct cpumask *cpus, int *balance) +static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, + enum cpu_idle_type idle, int *sd_idle, + const struct cpumask *cpus, int *balance, + struct sd_lb_stats *sds) { - struct sd_lb_stats sds; struct sched_group *group = sd->groups; - unsigned long max_pull; + struct sg_lb_stats sgs; int load_idx; - memset(&sds, 0, sizeof(sds)); #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - sds.power_savings_balance = 1; - sds.min_nr_running = ULONG_MAX; + sds->power_savings_balance = 1; + sds->min_nr_running = ULONG_MAX; #endif load_idx = get_sd_load_idx(sd, idle); do { - struct sg_lb_stats sgs; int local_group; local_group = cpumask_test_cpu(this_cpu, @@ -3399,25 +3400,25 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); - if (balance && !(*balance)) - goto ret; + if (local_group && balance && !(*balance)) + return; - sds.total_load += sgs.group_load; - sds.total_pwr += group->__cpu_power; + sds->total_load += sgs.group_load; + sds->total_pwr += group->__cpu_power; if (local_group) { - sds.this_load = sgs.avg_load; - sds.this = group; - sds.this_nr_running = sgs.sum_nr_running; - sds.this_load_per_task = sgs.sum_weighted_load; - } else if (sgs.avg_load > sds.max_load && + sds->this_load = sgs.avg_load; + sds->this = group; + sds->this_nr_running = sgs.sum_nr_running; + sds->this_load_per_task = sgs.sum_weighted_load; + } else if (sgs.avg_load > sds->max_load && (sgs.sum_nr_running > sgs.group_capacity || sgs.group_imb)) { - sds.max_load = sgs.avg_load; - sds.busiest = group; - sds.busiest_nr_running = sgs.sum_nr_running; - sds.busiest_load_per_task = sgs.sum_weighted_load; - sds.group_imb = sgs.group_imb; + sds->max_load = sgs.avg_load; + sds->busiest = group; + sds->busiest_nr_running = sgs.sum_nr_running; + sds->busiest_load_per_task = sgs.sum_weighted_load; + sds->group_imb = sgs.group_imb; } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -3434,15 +3435,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * no need to do power savings balance at this domain */ if (local_group && - (sds.this_nr_running >= sgs.group_capacity || - !sds.this_nr_running)) - sds.power_savings_balance = 0; + (sds->this_nr_running >= sgs.group_capacity || + !sds->this_nr_running)) + sds->power_savings_balance = 0; /* * If a group is already running at full capacity or idle, * don't include that group in power savings calculations */ - if (!sds.power_savings_balance || + if (!sds->power_savings_balance || sgs.sum_nr_running >= sgs.group_capacity || !sgs.sum_nr_running) goto group_next; @@ -3452,13 +3453,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * This is the group from where we need to pick up the load * for saving power */ - if ((sgs.sum_nr_running < sds.min_nr_running) || - (sgs.sum_nr_running == sds.min_nr_running && + if ((sgs.sum_nr_running < sds->min_nr_running) || + (sgs.sum_nr_running == sds->min_nr_running && group_first_cpu(group) > - group_first_cpu(sds.group_min))) { - sds.group_min = group; - sds.min_nr_running = sgs.sum_nr_running; - sds.min_load_per_task = sgs.sum_weighted_load / + group_first_cpu(sds->group_min))) { + sds->group_min = group; + sds->min_nr_running = sgs.sum_nr_running; + sds->min_load_per_task = sgs.sum_weighted_load / sgs.sum_nr_running; } @@ -3470,18 +3471,46 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sgs.sum_nr_running > sgs.group_capacity - 1) goto group_next; - if (sgs.sum_nr_running > sds.leader_nr_running || - (sgs.sum_nr_running == sds.leader_nr_running && + if (sgs.sum_nr_running > sds->leader_nr_running || + (sgs.sum_nr_running == sds->leader_nr_running && group_first_cpu(group) < - group_first_cpu(sds.group_leader))) { - sds.group_leader = group; - sds.leader_nr_running = sgs.sum_nr_running; + group_first_cpu(sds->group_leader))) { + sds->group_leader = group; + sds->leader_nr_running = sgs.sum_nr_running; } group_next: #endif group = group->next; } while (group != sd->groups); +} +/******* find_busiest_group() helpers end here *********************/ + +/* + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the amount of weighted load which + * should be moved to restore balance via the imbalance parameter. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum cpu_idle_type idle, + int *sd_idle, const struct cpumask *cpus, int *balance) +{ + struct sd_lb_stats sds; + unsigned long max_pull; + + memset(&sds, 0, sizeof(sds)); + + /* + * Compute the various statistics relavent for load balancing at + * this level. + */ + update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, + balance, &sds); + + if (balance && !(*balance)) + goto ret; + if (!sds.busiest || sds.this_load >= sds.max_load || sds.busiest_nr_running == 0) goto out_balanced; -- cgit v1.1 From 2e6f44aeda426054fc58464df1ad571aecca0c92 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:06 +0530 Subject: sched: Create helper to calculate small_imbalance in fbg() Impact: cleanup We have two places in find_busiest_group() where we need to calculate the minor imbalance before returning the busiest group. Encapsulate this functionality into a seperate helper function. Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091406.13992.54316.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 131 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 70 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ec715f9..540147e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3484,6 +3484,71 @@ group_next: } while (group != sd->groups); } + +/** + * fix_small_imbalance - Calculate the minor imbalance that exists + * amongst the groups of a sched_domain, during + * load balancing. + * @sds: Statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: Variable to store the imbalance. + */ +static inline void fix_small_imbalance(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned int imbn = 2; + + if (sds->this_nr_running) { + sds->this_load_per_task /= sds->this_nr_running; + if (sds->busiest_load_per_task > + sds->this_load_per_task) + imbn = 1; + } else + sds->this_load_per_task = + cpu_avg_load_per_task(this_cpu); + + if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= + sds->busiest_load_per_task * imbn) { + *imbalance = sds->busiest_load_per_task; + return; + } + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += sds->busiest->__cpu_power * + min(sds->busiest_load_per_task, sds->max_load); + pwr_now += sds->this->__cpu_power * + min(sds->this_load_per_task, sds->this_load); + pwr_now /= SCHED_LOAD_SCALE; + + /* Amount of load we'd subtract */ + tmp = sg_div_cpu_power(sds->busiest, + sds->busiest_load_per_task * SCHED_LOAD_SCALE); + if (sds->max_load > tmp) + pwr_move += sds->busiest->__cpu_power * + min(sds->busiest_load_per_task, sds->max_load - tmp); + + /* Amount of load we'd add */ + if (sds->max_load * sds->busiest->__cpu_power < + sds->busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = sg_div_cpu_power(sds->this, + sds->max_load * sds->busiest->__cpu_power); + else + tmp = sg_div_cpu_power(sds->this, + sds->busiest_load_per_task * SCHED_LOAD_SCALE); + pwr_move += sds->this->__cpu_power * + min(sds->this_load_per_task, sds->this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; + + /* Move if we gain throughput */ + if (pwr_move > pwr_now) + *imbalance = sds->busiest_load_per_task; +} /******* find_busiest_group() helpers end here *********************/ /* @@ -3547,7 +3612,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if (sds.max_load < sds.avg_load) { *imbalance = 0; - goto small_imbalance; + fix_small_imbalance(&sds, this_cpu, imbalance); + goto ret_busiest; } /* Don't want to pull so many tasks that a group would go idle */ @@ -3565,67 +3631,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance < sds.busiest_load_per_task) { - unsigned long tmp, pwr_now, pwr_move; - unsigned int imbn; - -small_imbalance: - pwr_move = pwr_now = 0; - imbn = 2; - if (sds.this_nr_running) { - sds.this_load_per_task /= sds.this_nr_running; - if (sds.busiest_load_per_task > - sds.this_load_per_task) - imbn = 1; - } else - sds.this_load_per_task = - cpu_avg_load_per_task(this_cpu); - - if (sds.max_load - sds.this_load + - sds.busiest_load_per_task >= - sds.busiest_load_per_task * imbn) { - *imbalance = sds.busiest_load_per_task; - return sds.busiest; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by - * moving them. - */ - - pwr_now += sds.busiest->__cpu_power * - min(sds.busiest_load_per_task, sds.max_load); - pwr_now += sds.this->__cpu_power * - min(sds.this_load_per_task, sds.this_load); - pwr_now /= SCHED_LOAD_SCALE; - - /* Amount of load we'd subtract */ - tmp = sg_div_cpu_power(sds.busiest, - sds.busiest_load_per_task * SCHED_LOAD_SCALE); - if (sds.max_load > tmp) - pwr_move += sds.busiest->__cpu_power * - min(sds.busiest_load_per_task, - sds.max_load - tmp); - - /* Amount of load we'd add */ - if (sds.max_load * sds.busiest->__cpu_power < - sds.busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = sg_div_cpu_power(sds.this, - sds.max_load * sds.busiest->__cpu_power); - else - tmp = sg_div_cpu_power(sds.this, - sds.busiest_load_per_task * SCHED_LOAD_SCALE); - pwr_move += sds.this->__cpu_power * - min(sds.this_load_per_task, - sds.this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; - - /* Move if we gain throughput */ - if (pwr_move > pwr_now) - *imbalance = sds.busiest_load_per_task; - } + if (*imbalance < sds.busiest_load_per_task) + fix_small_imbalance(&sds, this_cpu, imbalance); +ret_busiest: return sds.busiest; out_balanced: -- cgit v1.1 From dbc523a3b86f9e1765b5e70e6886913b99cc5cec Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:12 +0530 Subject: sched: Create a helper function to calculate imbalance Move all the imbalance calculation out of find_busiest_group() through this helper function. With this change, the structure of find_busiest_group() will be as follows: - update_sched_domain_statistics. - check if imbalance exits. - update imbalance and return busiest. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091411.13992.43293.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 78 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 540147e..934f615 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3487,8 +3487,8 @@ group_next: /** * fix_small_imbalance - Calculate the minor imbalance that exists - * amongst the groups of a sched_domain, during - * load balancing. + * amongst the groups of a sched_domain, during + * load balancing. * @sds: Statistics of the sched_domain whose imbalance is to be calculated. * @this_cpu: The cpu at whose sched_domain we're performing load-balance. * @imbalance: Variable to store the imbalance. @@ -3549,6 +3549,47 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, if (pwr_move > pwr_now) *imbalance = sds->busiest_load_per_task; } + +/** + * calculate_imbalance - Calculate the amount of imbalance present within the + * groups of a given sched_domain during load balance. + * @sds: statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: Cpu for which currently load balance is being performed. + * @imbalance: The variable to store the imbalance. + */ +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, + unsigned long *imbalance) +{ + unsigned long max_pull; + /* + * In the presence of smp nice balancing, certain scenarios can have + * max load less than avg load(as we skip the groups at or below + * its cpu_power, while calculating max_load..) + */ + if (sds->max_load < sds->avg_load) { + *imbalance = 0; + return fix_small_imbalance(sds, this_cpu, imbalance); + } + + /* Don't want to pull so many tasks that a group would go idle */ + max_pull = min(sds->max_load - sds->avg_load, + sds->max_load - sds->busiest_load_per_task); + + /* How much load to actually move to equalise the imbalance */ + *imbalance = min(max_pull * sds->busiest->__cpu_power, + (sds->avg_load - sds->this_load) * sds->this->__cpu_power) + / SCHED_LOAD_SCALE; + + /* + * if *imbalance is less than the average load per runnable task + * there is no gaurantee that any tasks will be moved so we'll have + * a think about bumping its value to force at least one task to be + * moved + */ + if (*imbalance < sds->busiest_load_per_task) + return fix_small_imbalance(sds, this_cpu, imbalance); + +} /******* find_busiest_group() helpers end here *********************/ /* @@ -3562,7 +3603,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, int *sd_idle, const struct cpumask *cpus, int *balance) { struct sd_lb_stats sds; - unsigned long max_pull; memset(&sds, 0, sizeof(sds)); @@ -3605,36 +3645,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sds.max_load <= sds.busiest_load_per_task) goto out_balanced; - /* - * In the presence of smp nice balancing, certain scenarios can have - * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) - */ - if (sds.max_load < sds.avg_load) { - *imbalance = 0; - fix_small_imbalance(&sds, this_cpu, imbalance); - goto ret_busiest; - } - - /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(sds.max_load - sds.avg_load, - sds.max_load - sds.busiest_load_per_task); - - /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds.busiest->__cpu_power, - (sds.avg_load - sds.this_load) * sds.this->__cpu_power) - / SCHED_LOAD_SCALE; - - /* - * if *imbalance is less than the average load per runnable task - * there is no gaurantee that any tasks will be moved so we'll have - * a think about bumping its value to force at least one task to be - * moved - */ - if (*imbalance < sds.busiest_load_per_task) - fix_small_imbalance(&sds, this_cpu, imbalance); - -ret_busiest: + /* Looks like there is an imbalance. Compute it */ + calculate_imbalance(&sds, this_cpu, imbalance); return sds.busiest; out_balanced: -- cgit v1.1 From a021dc03376707c55a3483e32c16b8986d4414cc Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:17 +0530 Subject: sched: Optimize the !power_savings_balance during fbg() Impact: cleanup, micro-optimization We don't need to perform power_savings balance if either the cpu is NOT_IDLE or if the sched_domain doesn't contain the SD_POWERSAVINGS_BALANCE flag set. Currently, we check for these conditions multiple number of times, even though these variables don't change over the scope of find_busiest_group(). Check once, and store the value in the already exiting "power_savings_balance" variable. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091417.13992.2657.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 934f615..71e8dca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3386,8 +3386,17 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, int load_idx; #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + sds->power_savings_balance = 0; + else { + sds->power_savings_balance = 1; + sds->min_nr_running = ULONG_MAX; + sds->leader_nr_running = 0; + } #endif load_idx = get_sd_load_idx(sd, idle); @@ -3422,12 +3431,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || - !(sd->flags & SD_POWERSAVINGS_BALANCE)) + + if (!sds->power_savings_balance) goto group_next; /* @@ -3651,7 +3656,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, out_balanced: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + if (!sds.power_savings_balance) goto ret; if (sds.this != sds.group_leader || sds.group_leader == sds.group_min) -- cgit v1.1 From c071df18525a95b37dd5821a6dc4af83bd18675e Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:22 +0530 Subject: sched: Refactor the power savings balance code Impact: cleanup Create seperate helper functions to initialize the power-savings-balance related variables, to update them and to check if we have a scope for performing power-savings balance. Add no-op inline functions for the !(CONFIG_SCHED_MC || CONFIG_SCHED_SMT) case. This will eliminate all the #ifdef jungle in find_busiest_group() and the other helper functions. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091422.13992.73616.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 236 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 153 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 71e8dca..5f21658 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3270,6 +3270,151 @@ static inline int get_sd_load_idx(struct sched_domain *sd, } +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * init_sd_power_savings_stats - Initialize power savings statistics for + * the given sched_domain, during load balancing. + * + * @sd: Sched domain whose power-savings statistics are to be initialized. + * @sds: Variable containing the statistics for sd. + * @idle: Idle status of the CPU at which we're performing load-balancing. + */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, + struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + sds->power_savings_balance = 0; + else { + sds->power_savings_balance = 1; + sds->min_nr_running = ULONG_MAX; + sds->leader_nr_running = 0; + } +} + +/** + * update_sd_power_savings_stats - Update the power saving stats for a + * sched_domain while performing load balancing. + * + * @group: sched_group belonging to the sched_domain under consideration. + * @sds: Variable containing the statistics of the sched_domain + * @local_group: Does group contain the CPU for which we're performing + * load balancing ? + * @sgs: Variable containing the statistics of the group. + */ +static inline void update_sd_power_savings_stats(struct sched_group *group, + struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ + + if (!sds->power_savings_balance) + return; + + /* + * If the local group is idle or completely loaded + * no need to do power savings balance at this domain + */ + if (local_group && (sds->this_nr_running >= sgs->group_capacity || + !sds->this_nr_running)) + sds->power_savings_balance = 0; + + /* + * If a group is already running at full capacity or idle, + * don't include that group in power savings calculations + */ + if (!sds->power_savings_balance || + sgs->sum_nr_running >= sgs->group_capacity || + !sgs->sum_nr_running) + return; + + /* + * Calculate the group which has the least non-idle load. + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sgs->sum_nr_running < sds->min_nr_running) || + (sgs->sum_nr_running == sds->min_nr_running && + group_first_cpu(group) > group_first_cpu(sds->group_min))) { + sds->group_min = group; + sds->min_nr_running = sgs->sum_nr_running; + sds->min_load_per_task = sgs->sum_weighted_load / + sgs->sum_nr_running; + } + + /* + * Calculate the group which is almost near its + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sgs->sum_nr_running > sgs->group_capacity - 1) + return; + + if (sgs->sum_nr_running > sds->leader_nr_running || + (sgs->sum_nr_running == sds->leader_nr_running && + group_first_cpu(group) < group_first_cpu(sds->group_leader))) { + sds->group_leader = group; + sds->leader_nr_running = sgs->sum_nr_running; + } +} + +/** + * check_power_save_busiest_group - Check if we have potential to perform + * some power-savings balance. If yes, set the busiest group to be + * the least loaded group in the sched_domain, so that it's CPUs can + * be put to idle. + * + * @sds: Variable containing the statistics of the sched_domain + * under consideration. + * @this_cpu: Cpu at which we're currently performing load-balancing. + * @imbalance: Variable to store the imbalance. + * + * Returns 1 if there is potential to perform power-savings balance. + * Else returns 0. + */ +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + if (!sds->power_savings_balance) + return 0; + + if (sds->this != sds->group_leader || + sds->group_leader == sds->group_min) + return 0; + + *imbalance = sds->min_load_per_task; + sds->busiest = sds->group_min; + + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + group_first_cpu(sds->group_leader); + } + + return 1; + +} +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, + struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ + return; +} + +static inline void update_sd_power_savings_stats(struct sched_group *group, + struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ + return; +} + +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + return 0; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ + + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @group: sched_group whose statistics are to be updated. @@ -3385,19 +3530,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, struct sg_lb_stats sgs; int load_idx; -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - sds->power_savings_balance = 0; - else { - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; - sds->leader_nr_running = 0; - } -#endif + init_sd_power_savings_stats(sd, sds, idle); load_idx = get_sd_load_idx(sd, idle); do { @@ -3430,61 +3563,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->group_imb = sgs.group_imb; } -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - - if (!sds->power_savings_balance) - goto group_next; - - /* - * If the local group is idle or completely loaded - * no need to do power savings balance at this domain - */ - if (local_group && - (sds->this_nr_running >= sgs.group_capacity || - !sds->this_nr_running)) - sds->power_savings_balance = 0; - - /* - * If a group is already running at full capacity or idle, - * don't include that group in power savings calculations - */ - if (!sds->power_savings_balance || - sgs.sum_nr_running >= sgs.group_capacity || - !sgs.sum_nr_running) - goto group_next; - - /* - * Calculate the group which has the least non-idle load. - * This is the group from where we need to pick up the load - * for saving power - */ - if ((sgs.sum_nr_running < sds->min_nr_running) || - (sgs.sum_nr_running == sds->min_nr_running && - group_first_cpu(group) > - group_first_cpu(sds->group_min))) { - sds->group_min = group; - sds->min_nr_running = sgs.sum_nr_running; - sds->min_load_per_task = sgs.sum_weighted_load / - sgs.sum_nr_running; - } - - /* - * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sgs.sum_nr_running > sgs.group_capacity - 1) - goto group_next; - - if (sgs.sum_nr_running > sds->leader_nr_running || - (sgs.sum_nr_running == sds->leader_nr_running && - group_first_cpu(group) < - group_first_cpu(sds->group_leader))) { - sds->group_leader = group; - sds->leader_nr_running = sgs.sum_nr_running; - } -group_next: -#endif + update_sd_power_savings_stats(group, sds, local_group, &sgs); group = group->next; } while (group != sd->groups); @@ -3655,21 +3734,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, return sds.busiest; out_balanced: -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (!sds.power_savings_balance) - goto ret; - - if (sds.this != sds.group_leader || sds.group_leader == sds.group_min) - goto ret; - - *imbalance = sds.min_load_per_task; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(sds.group_leader); - } - return sds.group_min; - -#endif + /* + * There is no obvious imbalance. But check if we can do some balancing + * to save power. + */ + if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) + return sds.busiest; ret: *imbalance = 0; return NULL; -- cgit v1.1 From b7bb4c9bb01941fe8feb653f3410e7ed0c9bb786 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:27 +0530 Subject: sched: Add comments to find_busiest_group() function Impact: cleanup Add /** style comments around find_busiest_group(). Also add a few explanatory comments. This concludes the find_busiest_group() cleanup. The function is now down to 72 lines from the original 313 lines. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091427.13992.18933.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 50 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5f21658..9f8506d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3676,10 +3676,30 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, } /******* find_busiest_group() helpers end here *********************/ -/* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the amount of weighted load which - * should be moved to restore balance via the imbalance parameter. +/** + * find_busiest_group - Returns the busiest group within the sched_domain + * if there is an imbalance. If there isn't an imbalance, and + * the user has opted for power-savings, it returns a group whose + * CPUs can be put to idle by rebalancing those tasks elsewhere, if + * such a group exists. + * + * Also calculates the amount of weighted load which should be moved + * to restore balance. + * + * @sd: The sched_domain whose busiest group is to be returned. + * @this_cpu: The cpu for which load balancing is currently being performed. + * @imbalance: Variable which stores amount of weighted load which should + * be moved to restore balance/put a group to idle. + * @idle: The idle status of this_cpu. + * @sd_idle: The idleness of sd + * @cpus: The set of CPUs under consideration for load-balancing. + * @balance: Pointer to a variable indicating if this_cpu + * is the appropriate cpu to perform load balancing at this_level. + * + * Returns: - the busiest group if imbalance exists. + * - If no imbalance and user has opted for power-savings balance, + * return the least loaded group whose CPUs can be + * put to idle by rebalancing its tasks onto our group. */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, @@ -3697,17 +3717,31 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, balance, &sds); + /* Cases where imbalance does not exist from POV of this_cpu */ + /* 1) this_cpu is not the appropriate cpu to perform load balancing + * at this level. + * 2) There is no busy sibling group to pull from. + * 3) This group is the busiest group. + * 4) This group is more busy than the avg busieness at this + * sched_domain. + * 5) The imbalance is within the specified limit. + * 6) Any rebalance would lead to ping-pong + */ if (balance && !(*balance)) goto ret; - if (!sds.busiest || sds.this_load >= sds.max_load - || sds.busiest_nr_running == 0) + if (!sds.busiest || sds.busiest_nr_running == 0) + goto out_balanced; + + if (sds.this_load >= sds.max_load) goto out_balanced; sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - if (sds.this_load >= sds.avg_load || - 100*sds.max_load <= sd->imbalance_pct * sds.this_load) + if (sds.this_load >= sds.avg_load) + goto out_balanced; + + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; sds.busiest_load_per_task /= sds.busiest_nr_running; -- cgit v1.1 From 3ba13d179e8c24c68eac32b93593a6b10fcd1572 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Feb 2009 06:02:22 +0000 Subject: constify dentry_operations: rest Signed-off-by: Al Viro --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9edb5c4..b01100e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1627,7 +1627,7 @@ static struct inode_operations cgroup_dir_inode_operations = { static int cgroup_create_file(struct dentry *dentry, int mode, struct super_block *sb) { - static struct dentry_operations cgroup_dops = { + static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, }; -- cgit v1.1 From a3ec947c85ec339884b30ef6a08133e9311fdae1 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 4 Mar 2009 12:06:34 -0800 Subject: vfs: simple_set_mnt() should return void simple_set_mnt() is defined as returning 'int' but always returns 0. Callers assume simple_set_mnt() never fails and don't properly cleanup if it were to _ever_ fail. For instance, get_sb_single() and get_sb_nodev() should: up_write(sb->s_unmount); deactivate_super(sb); if simple_set_mnt() fails. Since simple_set_mnt() never fails, would be cleaner if it did not return anything. [akpm@linux-foundation.org: fix build] Signed-off-by: Sukadev Bhattiprolu Acked-by: Serge Hallyn Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- kernel/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b01100e..c500ca7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1071,7 +1071,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, mutex_unlock(&cgroup_mutex); } - return simple_set_mnt(mnt, sb); + simple_set_mnt(mnt, sb); + return 0; free_cg_links: free_cg_links(&tmp_cg_links); -- cgit v1.1 From 9710794383ee5008d67f1a6613a4717bf6de47bc Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 15 Mar 2009 11:11:44 -0700 Subject: async: remove the temporary (2.6.29) "async is off by default" code Now that everyone has been able to test the async code (and it's being used in the Moblin betas by default), we can enable it by default. The various fixes needed have gone into 2.6.29 already. [With an important bugfix from Stefan Richter] Signed-off-by: Arjan van de Ven --- kernel/async.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index f565891..968ef94 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -49,6 +49,7 @@ asynchronous and synchronous parts of the kernel. */ #include +#include #include #include #include @@ -387,20 +388,11 @@ static int async_manager_thread(void *unused) static int __init async_init(void) { - if (async_enabled) - if (IS_ERR(kthread_run(async_manager_thread, NULL, - "async/mgr"))) - async_enabled = 0; - return 0; -} + async_enabled = + !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr")); -static int __init setup_async(char *str) -{ - async_enabled = 1; - return 1; + WARN_ON(!async_enabled); + return 0; } -__setup("fastboot", setup_async); - - core_initcall(async_init); -- cgit v1.1 From d5ac537e5fb6fc12384c9f3ed6a15e912dfbbc2a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Mar 2009 21:52:47 -0700 Subject: sched: fix errors in struct & function comments Fix kernel-doc errors in sched.c: the structs don't have kernel-doc notation and the short function description needs to be one line only. Error(kernel/sched.c:3197): cannot understand prototype: 'struct sd_lb_stats ' Error(kernel/sched.c:3228): cannot understand prototype: 'struct sg_lb_stats ' Error(kernel/sched.c:3375): duplicate section name 'Description' Signed-off-by: Randy Dunlap cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f4c413b..5757e03 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3190,7 +3190,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } /********** Helpers for find_busiest_group ************************/ -/** +/* * sd_lb_stats - Structure to store the statistics of a sched_domain * during load balancing. */ @@ -3222,7 +3222,7 @@ struct sd_lb_stats { #endif }; -/** +/* * sg_lb_stats - stats of a sched_group required for load_balancing */ struct sg_lb_stats { @@ -3360,16 +3360,17 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, } /** - * check_power_save_busiest_group - Check if we have potential to perform - * some power-savings balance. If yes, set the busiest group to be - * the least loaded group in the sched_domain, so that it's CPUs can - * be put to idle. - * + * check_power_save_busiest_group - see if there is potential for some power-savings balance * @sds: Variable containing the statistics of the sched_domain * under consideration. * @this_cpu: Cpu at which we're currently performing load-balancing. * @imbalance: Variable to store the imbalance. * + * Description: + * Check if we have potential to perform some power-savings balance. + * If yes, set the busiest group to be the least loaded group in the + * sched_domain, so that it's CPUs can be put to idle. + * * Returns 1 if there is potential to perform power-savings balance. * Else returns 0. */ -- cgit v1.1 From 1a2142afa5646ad5af44bbe1febaa5e0b7e71156 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 22:05:10 -0600 Subject: cpumask: remove dangerous CPU_MASK_ALL_PTR, &CPU_MASK_ALL Impact: cleanup (Thanks to Al Viro for reminding me of this, via Ingo) CPU_MASK_ALL is the (deprecated) "all bits set" cpumask, defined as so: #define CPU_MASK_ALL (cpumask_t) { { ... } } Taking the address of such a temporary is questionable at best, unfortunately 321a8e9d (cpumask: add CPU_MASK_ALL_PTR macro) added CPU_MASK_ALL_PTR: #define CPU_MASK_ALL_PTR (&CPU_MASK_ALL) Which formalizes this practice. One day gcc could bite us over this usage (though we seem to have gotten away with it so far). So replace everywhere which used &CPU_MASK_ALL or CPU_MASK_ALL_PTR with the modern "cpu_all_mask" (a real const struct cpumask *). Signed-off-by: Rusty Russell Acked-by: Ingo Molnar Reported-by: Al Viro Cc: Mike Travis --- kernel/kmod.c | 2 +- kernel/kthread.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index a27a5f6..f0c8f54 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -167,7 +167,7 @@ static int ____call_usermodehelper(void *data) } /* We can run anywhere, unlike our parent keventd(). */ - set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); + set_cpus_allowed_ptr(current, cpu_all_mask); /* * Our parent is keventd, which runs with elevated scheduling priority. diff --git a/kernel/kthread.c b/kernel/kthread.c index 4fbc456..84bbadd 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -110,7 +110,7 @@ static void create_kthread(struct kthread_create_info *create) */ sched_setscheduler(create->result, SCHED_NORMAL, ¶m); set_user_nice(create->result, KTHREAD_NICE_LEVEL); - set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR); + set_cpus_allowed_ptr(create->result, cpu_all_mask); } complete(&create->done); } @@ -240,7 +240,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_user_nice(tsk, KTHREAD_NICE_LEVEL); - set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR); + set_cpus_allowed_ptr(tsk, cpu_all_mask); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; -- cgit v1.1 From 2b17fa506c418e9fb02bbbc7f426d2bbb5b247a6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 22:05:12 -0600 Subject: cpumask: use set_cpu_active in init/main.c cpu_active_map is deprecated in favor of cpu_active_mask, which is const for safety: we use accessors now (set_cpu_active) is we really want to make a change. Signed-off-by: Rusty Russell --- kernel/cpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 79e40f0..395b697 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -281,7 +281,7 @@ int __ref cpu_down(unsigned int cpu) goto out; } - cpu_clear(cpu, cpu_active_map); + set_cpu_active(cpu, false); /* * Make sure the all cpus did the reschedule and are not @@ -296,7 +296,7 @@ int __ref cpu_down(unsigned int cpu) err = _cpu_down(cpu, 0); if (cpu_online(cpu)) - cpu_set(cpu, cpu_active_map); + set_cpu_active(cpu, true); out: cpu_maps_update_done(); @@ -333,7 +333,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); - cpu_set(cpu, cpu_active_map); + set_cpu_active(cpu, true); /* Now call notifier in preparation. */ raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); -- cgit v1.1 From 9489424454c93f4d225d7af47978f8c7e84bf4d4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 22:05:12 -0600 Subject: cpumask: use mm_cpumask() wrapper: kernel/fork.c Impact: futureproof Makes code futureproof against the impending change to mm->cpu_vm_mask. It's also a chance to use the new cpumask_ ops which take a pointer. Signed-off-by: Rusty Russell --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 6715ebc..47c1584 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -284,7 +284,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; - cpus_clear(mm->cpu_vm_mask); + cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; -- cgit v1.1 From aa85ea5b89c36c51200d795dd788139bd9b8cf50 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 22:05:15 -0600 Subject: cpumask: use new cpumask_ functions in core code. Impact: cleanup Time to clean up remaining laggards using the old cpu_ functions. Signed-off-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Trond.Myklebust@netapp.com --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1f0c509..9aedd9f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -416,7 +416,7 @@ void flush_workqueue(struct workqueue_struct *wq) might_sleep(); lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); - for_each_cpu_mask_nr(cpu, *cpu_map) + for_each_cpu(cpu, cpu_map) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -547,7 +547,7 @@ static void wait_on_work(struct work_struct *work) wq = cwq->wq; cpu_map = wq_cpu_map(wq); - for_each_cpu_mask_nr(cpu, *cpu_map) + for_each_cpu(cpu, cpu_map) wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } @@ -911,7 +911,7 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del(&wq->list); spin_unlock(&workqueue_lock); - for_each_cpu_mask_nr(cpu, *cpu_map) + for_each_cpu(cpu, cpu_map) cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); cpu_maps_update_done(); -- cgit v1.1 From 73d0a4b107d58908305f272bfae9bd17f74a2c81 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 22:05:16 -0600 Subject: cpumask: convert rcutorture.c We're getting rid of cpumasks on the stack. Simply change tmp_mask to a global, and allocate it in rcu_torture_init(). Signed-off-by: Rusty Russell Acked-by: "Paul E. McKenney" Cc: Josh Triplett --- kernel/rcutorture.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 7c4142a..9b4a975 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -126,6 +126,7 @@ static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; static long n_rcu_torture_timers = 0; static struct list_head rcu_torture_removed; +static cpumask_var_t shuffle_tmp_mask; static int stutter_pause_test = 0; @@ -889,10 +890,9 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ */ static void rcu_torture_shuffle_tasks(void) { - cpumask_t tmp_mask; int i; - cpus_setall(tmp_mask); + cpumask_setall(shuffle_tmp_mask); get_online_cpus(); /* No point in shuffling if there is only one online CPU (ex: UP) */ @@ -902,29 +902,29 @@ static void rcu_torture_shuffle_tasks(void) } if (rcu_idle_cpu != -1) - cpu_clear(rcu_idle_cpu, tmp_mask); + cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); - set_cpus_allowed_ptr(current, &tmp_mask); + set_cpus_allowed_ptr(current, shuffle_tmp_mask); if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) set_cpus_allowed_ptr(reader_tasks[i], - &tmp_mask); + shuffle_tmp_mask); } if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed_ptr(fakewriter_tasks[i], - &tmp_mask); + shuffle_tmp_mask); } if (writer_task) - set_cpus_allowed_ptr(writer_task, &tmp_mask); + set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); if (stats_task) - set_cpus_allowed_ptr(stats_task, &tmp_mask); + set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; @@ -1012,6 +1012,7 @@ rcu_torture_cleanup(void) if (shuffler_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); kthread_stop(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); } shuffler_task = NULL; @@ -1190,10 +1191,18 @@ rcu_torture_init(void) } if (test_no_idle_hz) { rcu_idle_cpu = num_online_cpus() - 1; + + if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { + firsterr = -ENOMEM; + VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); + goto unwind; + } + /* Create the shuffler thread */ shuffler_task = kthread_run(rcu_torture_shuffle, NULL, "rcu_torture_shuffle"); if (IS_ERR(shuffler_task)) { + free_cpumask_var(shuffle_tmp_mask); firsterr = PTR_ERR(shuffler_task); VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); shuffler_task = NULL; -- cgit v1.1 From 612a726faf8486fa48b34fa37115ce1e7421d383 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 22:05:16 -0600 Subject: cpumask: remove cpumask_t from core Impact: cleanup struct cpumask is nicer, and we use it to make where we've made code safe for CONFIG_CPUMASK_OFFSTACK=y. Signed-off-by: Rusty Russell --- kernel/sched_cpupri.h | 2 +- kernel/stop_machine.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 642a94e..9a7e859 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -25,7 +25,7 @@ struct cpupri { #ifdef CONFIG_SMP int cpupri_find(struct cpupri *cp, - struct task_struct *p, cpumask_t *lowest_mask); + struct task_struct *p, struct cpumask *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp, bool bootmem); void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 74541ca..912823e 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -44,7 +44,7 @@ static DEFINE_MUTEX(setup_lock); static int refcount; static struct workqueue_struct *stop_machine_wq; static struct stop_machine_data active, idle; -static const cpumask_t *active_cpus; +static const struct cpumask *active_cpus; static void *stop_machine_work; static void set_state(enum stopmachine_state newstate) -- cgit v1.1 From 0a0c5168df270a50e3518e4f12bddb31f8f5f38f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 16 Mar 2009 22:33:49 +0100 Subject: PM: Introduce functions for suspending and resuming device interrupts Introduce helper functions allowing us to prevent device drivers from getting any interrupts (without disabling interrupts on the CPU) during suspend (or hibernation) and to make them start to receive interrupts again during the subsequent resume. These functions make it possible to keep timer interrupts enabled while the "late" suspend and "early" resume callbacks provided by device drivers are being executed. In turn, this allows device drivers' "late" suspend and "early" resume callbacks to sleep, execute ACPI callbacks etc. The functions introduced here will be used to rework the handling of interrupts during suspend (hibernation) and resume. Namely, interrupts will only be disabled on the CPU right before suspending sysdevs, while device drivers will be prevented from receiving interrupts, with the help of the new helper function, before their "late" suspend callbacks run (and analogously during resume). Signed-off-by: Rafael J. Wysocki Acked-by: Ingo Molnar --- kernel/irq/Makefile | 1 + kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 31 +++++++++++++++----- kernel/irq/pm.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 106 insertions(+), 7 deletions(-) create mode 100644 kernel/irq/pm.c (limited to 'kernel') diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 4dd5b1e..3394f8f 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o +obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ee1aa9f..01ce20e 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -12,6 +12,8 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); +extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); +extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6458e99..1516ab7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -162,6 +162,20 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) } #endif +void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) +{ + if (suspend) { + if (!desc->action || (desc->action->flags & IRQF_TIMER)) + return; + desc->status |= IRQ_SUSPENDED; + } + + if (!desc->depth++) { + desc->status |= IRQ_DISABLED; + desc->chip->disable(irq); + } +} + /** * disable_irq_nosync - disable an irq without waiting * @irq: Interrupt to disable @@ -182,10 +196,7 @@ void disable_irq_nosync(unsigned int irq) return; spin_lock_irqsave(&desc->lock, flags); - if (!desc->depth++) { - desc->status |= IRQ_DISABLED; - desc->chip->disable(irq); - } + __disable_irq(desc, irq, false); spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL(disable_irq_nosync); @@ -215,15 +226,21 @@ void disable_irq(unsigned int irq) } EXPORT_SYMBOL(disable_irq); -static void __enable_irq(struct irq_desc *desc, unsigned int irq) +void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) { + if (resume) + desc->status &= ~IRQ_SUSPENDED; + switch (desc->depth) { case 0: + err_out: WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { unsigned int status = desc->status & ~IRQ_DISABLED; + if (desc->status & IRQ_SUSPENDED) + goto err_out; /* Prevent probing on this irq: */ desc->status = status | IRQ_NOPROBE; check_irq_resend(desc, irq); @@ -253,7 +270,7 @@ void enable_irq(unsigned int irq) return; spin_lock_irqsave(&desc->lock, flags); - __enable_irq(desc, irq); + __enable_irq(desc, irq, false); spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL(enable_irq); @@ -511,7 +528,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { desc->status &= ~IRQ_SPURIOUS_DISABLED; - __enable_irq(desc, irq); + __enable_irq(desc, irq, false); } spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c new file mode 100644 index 0000000..638d8be --- /dev/null +++ b/kernel/irq/pm.c @@ -0,0 +1,79 @@ +/* + * linux/kernel/irq/pm.c + * + * Copyright (C) 2009 Rafael J. Wysocki , Novell Inc. + * + * This file contains power management functions related to interrupts. + */ + +#include +#include +#include + +#include "internals.h" + +/** + * suspend_device_irqs - disable all currently enabled interrupt lines + * + * During system-wide suspend or hibernation device interrupts need to be + * disabled at the chip level and this function is provided for this purpose. + * It disables all interrupt lines that are enabled at the moment and sets the + * IRQ_SUSPENDED flag for them. + */ +void suspend_device_irqs(void) +{ + struct irq_desc *desc; + int irq; + + for_each_irq_desc(irq, desc) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + __disable_irq(desc, irq, true); + spin_unlock_irqrestore(&desc->lock, flags); + } + + for_each_irq_desc(irq, desc) + if (desc->status & IRQ_SUSPENDED) + synchronize_irq(irq); +} +EXPORT_SYMBOL_GPL(suspend_device_irqs); + +/** + * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() + * + * Enable all interrupt lines previously disabled by suspend_device_irqs() that + * have the IRQ_SUSPENDED flag set. + */ +void resume_device_irqs(void) +{ + struct irq_desc *desc; + int irq; + + for_each_irq_desc(irq, desc) { + unsigned long flags; + + if (!(desc->status & IRQ_SUSPENDED)) + continue; + + spin_lock_irqsave(&desc->lock, flags); + __enable_irq(desc, irq, true); + spin_unlock_irqrestore(&desc->lock, flags); + } +} +EXPORT_SYMBOL_GPL(resume_device_irqs); + +/** + * check_wakeup_irqs - check if any wake-up interrupts are pending + */ +int check_wakeup_irqs(void) +{ + struct irq_desc *desc; + int irq; + + for_each_irq_desc(irq, desc) + if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) + return -EBUSY; + + return 0; +} -- cgit v1.1 From 2ed8d2b3a81bdbb0418301628ccdb008ac9f40b7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 16 Mar 2009 22:34:06 +0100 Subject: PM: Rework handling of interrupts during suspend-resume Use the functions introduced in by the previous patch, suspend_device_irqs(), resume_device_irqs() and check_wakeup_irqs(), to rework the handling of interrupts during suspend (hibernation) and resume. Namely, interrupts will only be disabled on the CPU right before suspending sysdevs, while device drivers will be prevented from receiving interrupts, with the help of the new helper function, before their "late" suspend callbacks run (and analogously during resume). In addition, since the device interrups are now disabled before the CPU has turned all interrupts off and the CPU will ACK the interrupts setting the IRQ_PENDING bit for them, check in sysdev_suspend() if any wake-up interrupts are pending and abort suspend if that's the case. Signed-off-by: Rafael J. Wysocki Acked-by: Ingo Molnar --- kernel/kexec.c | 8 ++++---- kernel/power/disk.c | 39 +++++++++++++++++++++++++++++---------- kernel/power/main.c | 17 +++++++++++------ 3 files changed, 44 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index c7fd669..dade9af 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1454,7 +1454,6 @@ int kernel_kexec(void) if (error) goto Resume_devices; device_pm_lock(); - local_irq_disable(); /* At this point, device_suspend() has been called, * but *not* device_power_down(). We *must* * device_power_down() now. Otherwise, drivers for @@ -1464,8 +1463,9 @@ int kernel_kexec(void) */ error = device_power_down(PMSG_FREEZE); if (error) - goto Enable_irqs; + goto Unlock_pm; + local_irq_disable(); /* Suspend system devices */ error = sysdev_suspend(PMSG_FREEZE); if (error) @@ -1484,9 +1484,9 @@ int kernel_kexec(void) if (kexec_image->preserve_context) { sysdev_resume(); Power_up_devices: - device_power_up(PMSG_RESTORE); - Enable_irqs: local_irq_enable(); + device_power_up(PMSG_RESTORE); + Unlock_pm: device_pm_unlock(); enable_nonboot_cpus(); Resume_devices: diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 4a4a206..320bb09 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -214,7 +214,7 @@ static int create_image(int platform_mode) return error; device_pm_lock(); - local_irq_disable(); + /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* call device_power_down() now. * Otherwise, drivers for some devices (e.g. interrupt controllers) @@ -225,8 +225,11 @@ static int create_image(int platform_mode) if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); - goto Enable_irqs; + goto Unlock; } + + local_irq_disable(); + sysdev_suspend(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " @@ -252,12 +255,16 @@ static int create_image(int platform_mode) /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ + Power_up_devices: + local_irq_enable(); + device_power_up(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - Enable_irqs: - local_irq_enable(); + + Unlock: device_pm_unlock(); + return error; } @@ -336,13 +343,16 @@ static int resume_target_kernel(void) int error; device_pm_lock(); - local_irq_disable(); + error = device_power_down(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); - goto Enable_irqs; + goto Unlock; } + + local_irq_disable(); + sysdev_suspend(PMSG_QUIESCE); /* We'll ignore saved state, but this gets preempt count (etc) right */ save_processor_state(); @@ -366,11 +376,16 @@ static int resume_target_kernel(void) swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); + sysdev_resume(); - device_power_up(PMSG_RECOVER); - Enable_irqs: + local_irq_enable(); + + device_power_up(PMSG_RECOVER); + + Unlock: device_pm_unlock(); + return error; } @@ -447,15 +462,16 @@ int hibernation_platform_enter(void) goto Finish; device_pm_lock(); - local_irq_disable(); + error = device_power_down(PMSG_HIBERNATE); if (!error) { + local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); hibernation_ops->enter(); /* We should never get here */ while (1); } - local_irq_enable(); + device_pm_unlock(); /* @@ -464,12 +480,15 @@ int hibernation_platform_enter(void) */ Finish: hibernation_ops->finish(); + Resume_devices: entering_platform_hibernation = false; device_resume(PMSG_RESTORE); resume_console(); + Close: hibernation_ops->end(); + return error; } diff --git a/kernel/power/main.c b/kernel/power/main.c index c9632f8..f0a4667 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -287,17 +287,19 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) */ static int suspend_enter(suspend_state_t state) { - int error = 0; + int error; device_pm_lock(); - arch_suspend_disable_irqs(); - BUG_ON(!irqs_disabled()); - if ((error = device_power_down(PMSG_SUSPEND))) { + error = device_power_down(PMSG_SUSPEND); + if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Done; } + arch_suspend_disable_irqs(); + BUG_ON(!irqs_disabled()); + error = sysdev_suspend(PMSG_SUSPEND); if (!error) { if (!suspend_test(TEST_CORE)) @@ -305,11 +307,14 @@ static int suspend_enter(suspend_state_t state) sysdev_resume(); } - device_power_up(PMSG_RESUME); - Done: arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); + + device_power_up(PMSG_RESUME); + + Done: device_pm_unlock(); + return error; } -- cgit v1.1 From 900af0d973856d6feb6fc088c2d0d3fde57707d3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 16 Mar 2009 22:34:15 +0100 Subject: PM: Change suspend code ordering Change the ordering of the suspend core code so that the platform "prepare" callback is executed and the nonboot CPUs are disabled after calling device drivers' "late suspend" methods. This change will allow us to rework the PCI PM core so that the power state of devices is changed in the "late" phase of suspend (and analogously in the "early" phase of resume), which in turn will allow us to avoid the race condition where a device using shared interrupts is put into a low power state with interrupts enabled and then an interrupt (for another device) comes in and confuses its driver. Signed-off-by: Rafael J. Wysocki Acked-by: Ingo Molnar --- kernel/power/main.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index f0a4667..f172f41 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -297,6 +297,19 @@ static int suspend_enter(suspend_state_t state) goto Done; } + if (suspend_ops->prepare) { + error = suspend_ops->prepare(); + if (error) + goto Power_up_devices; + } + + if (suspend_test(TEST_PLATFORM)) + goto Platfrom_finish; + + error = disable_nonboot_cpus(); + if (error || suspend_test(TEST_CPUS)) + goto Enable_cpus; + arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); @@ -310,6 +323,14 @@ static int suspend_enter(suspend_state_t state) arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); + Enable_cpus: + enable_nonboot_cpus(); + + Platfrom_finish: + if (suspend_ops->finish) + suspend_ops->finish(); + + Power_up_devices: device_power_up(PMSG_RESUME); Done: @@ -346,23 +367,8 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_test(TEST_DEVICES)) goto Recover_platform; - if (suspend_ops->prepare) { - error = suspend_ops->prepare(); - if (error) - goto Resume_devices; - } - - if (suspend_test(TEST_PLATFORM)) - goto Finish; + suspend_enter(state); - error = disable_nonboot_cpus(); - if (!error && !suspend_test(TEST_CPUS)) - suspend_enter(state); - - enable_nonboot_cpus(); - Finish: - if (suspend_ops->finish) - suspend_ops->finish(); Resume_devices: suspend_test_start(); device_resume(PMSG_RESUME); -- cgit v1.1 From 4aecd6718939eb3c4145b248369b65f7483a8a02 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 16 Mar 2009 22:34:26 +0100 Subject: PM: Change hibernation code ordering Change the ordering of the hibernation core code so that the platform "prepare" callbacks are executed and the nonboot CPUs are disabled after calling device drivers' "late suspend" methods. This change (along with the previous analogous change of the suspend core code) will allow us to rework the PCI PM core so that the power state of devices is changed in the "late" phase of suspend (and analogously in the "early" phase of resume), which in turn will allow us to avoid the race condition where a device using shared interrupts is put into a low power state with interrupts enabled and then an interrupt (for another device) comes in and confuses its driver. Signed-off-by: Rafael J. Wysocki Acked-by: Ingo Molnar --- kernel/power/disk.c | 109 +++++++++++++++++++++++++++++----------------------- 1 file changed, 61 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 320bb09..e886d13 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -228,13 +228,22 @@ static int create_image(int platform_mode) goto Unlock; } + error = platform_pre_snapshot(platform_mode); + if (error || hibernation_test(TEST_PLATFORM)) + goto Platform_finish; + + error = disable_nonboot_cpus(); + if (error || hibernation_test(TEST_CPUS) + || hibernation_testmode(HIBERNATION_TEST)) + goto Enable_cpus; + local_irq_disable(); sysdev_suspend(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); - goto Power_up_devices; + goto Enable_irqs; } if (hibernation_test(TEST_CORE)) @@ -250,15 +259,22 @@ static int create_image(int platform_mode) restore_processor_state(); if (!in_suspend) platform_leave(platform_mode); + Power_up: sysdev_resume(); /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ - Power_up_devices: + Enable_irqs: local_irq_enable(); + Enable_cpus: + enable_nonboot_cpus(); + + Platform_finish: + platform_finish(platform_mode); + device_power_up(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); @@ -298,25 +314,9 @@ int hibernation_snapshot(int platform_mode) if (hibernation_test(TEST_DEVICES)) goto Recover_platform; - error = platform_pre_snapshot(platform_mode); - if (error || hibernation_test(TEST_PLATFORM)) - goto Finish; - - error = disable_nonboot_cpus(); - if (!error) { - if (hibernation_test(TEST_CPUS)) - goto Enable_cpus; - - if (hibernation_testmode(HIBERNATION_TEST)) - goto Enable_cpus; + error = create_image(platform_mode); + /* Control returns here after successful restore */ - error = create_image(platform_mode); - /* Control returns here after successful restore */ - } - Enable_cpus: - enable_nonboot_cpus(); - Finish: - platform_finish(platform_mode); Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); @@ -338,7 +338,7 @@ int hibernation_snapshot(int platform_mode) * kernel. */ -static int resume_target_kernel(void) +static int resume_target_kernel(bool platform_mode) { int error; @@ -351,9 +351,20 @@ static int resume_target_kernel(void) goto Unlock; } + error = platform_pre_restore(platform_mode); + if (error) + goto Cleanup; + + error = disable_nonboot_cpus(); + if (error) + goto Enable_cpus; + local_irq_disable(); - sysdev_suspend(PMSG_QUIESCE); + error = sysdev_suspend(PMSG_QUIESCE); + if (error) + goto Enable_irqs; + /* We'll ignore saved state, but this gets preempt count (etc) right */ save_processor_state(); error = restore_highmem(); @@ -379,8 +390,15 @@ static int resume_target_kernel(void) sysdev_resume(); + Enable_irqs: local_irq_enable(); + Enable_cpus: + enable_nonboot_cpus(); + + Cleanup: + platform_restore_cleanup(platform_mode); + device_power_up(PMSG_RECOVER); Unlock: @@ -405,19 +423,10 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); error = device_suspend(PMSG_QUIESCE); - if (error) - goto Finish; - - error = platform_pre_restore(platform_mode); if (!error) { - error = disable_nonboot_cpus(); - if (!error) - error = resume_target_kernel(); - enable_nonboot_cpus(); + error = resume_target_kernel(platform_mode); + device_resume(PMSG_RECOVER); } - platform_restore_cleanup(platform_mode); - device_resume(PMSG_RECOVER); - Finish: resume_console(); pm_restore_console(); return error; @@ -453,34 +462,38 @@ int hibernation_platform_enter(void) goto Resume_devices; } + device_pm_lock(); + + error = device_power_down(PMSG_HIBERNATE); + if (error) + goto Unlock; + error = hibernation_ops->prepare(); if (error) - goto Resume_devices; + goto Platofrm_finish; error = disable_nonboot_cpus(); if (error) - goto Finish; - - device_pm_lock(); - - error = device_power_down(PMSG_HIBERNATE); - if (!error) { - local_irq_disable(); - sysdev_suspend(PMSG_HIBERNATE); - hibernation_ops->enter(); - /* We should never get here */ - while (1); - } + goto Platofrm_finish; - device_pm_unlock(); + local_irq_disable(); + sysdev_suspend(PMSG_HIBERNATE); + hibernation_ops->enter(); + /* We should never get here */ + while (1); /* * We don't need to reenable the nonboot CPUs or resume consoles, since * the system is going to be halted anyway. */ - Finish: + Platofrm_finish: hibernation_ops->finish(); + device_power_up(PMSG_RESTORE); + + Unlock: + device_pm_unlock(); + Resume_devices: entering_platform_hibernation = false; device_resume(PMSG_RESTORE); -- cgit v1.1 From 749b0afc3a9d90dda3319fd1464a3b32fc225361 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 16 Mar 2009 22:34:35 +0100 Subject: kexec: Change kexec jump code ordering Change the ordering of the kexec jump code so that the nonboot CPUs are disabled after calling device drivers' "late suspend" methods. This change reflects the recent modifications of the power management code that is also used by kexec jump. Signed-off-by: Rafael J. Wysocki Acked-by: Ingo Molnar --- kernel/kexec.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index dade9af..93eed85 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1450,9 +1450,6 @@ int kernel_kexec(void) error = device_suspend(PMSG_FREEZE); if (error) goto Resume_console; - error = disable_nonboot_cpus(); - if (error) - goto Resume_devices; device_pm_lock(); /* At this point, device_suspend() has been called, * but *not* device_power_down(). We *must* @@ -1463,13 +1460,15 @@ int kernel_kexec(void) */ error = device_power_down(PMSG_FREEZE); if (error) - goto Unlock_pm; - + goto Resume_devices; + error = disable_nonboot_cpus(); + if (error) + goto Enable_cpus; local_irq_disable(); /* Suspend system devices */ error = sysdev_suspend(PMSG_FREEZE); if (error) - goto Power_up_devices; + goto Enable_irqs; } else #endif { @@ -1483,13 +1482,13 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { sysdev_resume(); - Power_up_devices: + Enable_irqs: local_irq_enable(); - device_power_up(PMSG_RESTORE); - Unlock_pm: - device_pm_unlock(); + Enable_cpus: enable_nonboot_cpus(); + device_power_up(PMSG_RESTORE); Resume_devices: + device_pm_unlock(); device_resume(PMSG_RESTORE); Resume_console: resume_console(); -- cgit v1.1 From 2f8501815256af8498904e68bd0984b1afffd6f8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Mar 2009 11:13:20 +0100 Subject: lockdep: fix deadlock in lockdep_trace_alloc Heiko reported that we grab the graph lock with irqs enabled. Fix this by providng the same wrapper as all other lockdep entry functions have. Reported-by: Heiko Carstens Signed-off-by: Peter Zijlstra Cc: Nick Piggin LKML-Reference: <1237544000.24626.52.camel@twins> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 022d2ed..3673a3f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2260,7 +2260,7 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(&redundant_softirqs_off); } -void lockdep_trace_alloc(gfp_t gfp_mask) +static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) { struct task_struct *curr = current; @@ -2279,12 +2279,29 @@ void lockdep_trace_alloc(gfp_t gfp_mask) if (!(gfp_mask & __GFP_FS)) return; - if (DEBUG_LOCKS_WARN_ON(irqs_disabled())) + if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) return; mark_held_locks(curr, RECLAIM_FS); } +static void check_flags(unsigned long flags); + +void lockdep_trace_alloc(gfp_t gfp_mask) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lockdep_trace_alloc(gfp_mask, flags); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) { /* -- cgit v1.1