From 72b1e22dfcad1daca6906148fd956ffe404bb0bc Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:45 -0700 Subject: x64, x2apic/intr-remap: generic irq migration support from process context Generic infrastructure for migrating the irq from the process context in the presence of CONFIG_GENERIC_PENDING_IRQ. This will be used later for migrating irq in the presence of interrupt-remapping. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 46d6611..628b557 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -87,7 +87,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) set_balance_irq_affinity(irq, cpumask); #ifdef CONFIG_GENERIC_PENDING_IRQ - set_pending_irq(irq, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + desc->chip->set_affinity(irq, cpumask); + spin_unlock_irqrestore(&desc->lock, flags); + } else + set_pending_irq(irq, cpumask); #else desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); -- cgit v1.1 From 3cac97cbb14aed00d83eb33d4613b0fe3aaea863 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 6 Jul 2008 17:23:55 +0800 Subject: rcu classic: simplify the next pending batch use a batch number(rcp->pending) instead of a flag(rcp->next_pending) rcu_start_batch() need to change this flag, so mb()s is needed for memory-access safe. but(after this patch applied) rcu_start_batch() do not change this batch number(rcp->pending), rcp->pending is managed by __rcu_process_callbacks only, and troublesome mb()s are eliminated. And codes look simpler and clearer. Signed-off-by: Lai Jiangshan Cc: "Paul E. McKenney" Cc: Dipankar Sarma Cc: Gautham Shenoy Cc: Dhaval Giani Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 16eeeaa..03726eb 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -60,12 +60,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; @@ -276,14 +278,8 @@ static void rcu_do_batch(struct rcu_data *rdp) */ static void rcu_start_batch(struct rcu_ctrlblk *rcp) { - if (rcp->next_pending && + if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { - rcp->next_pending = 0; - /* - * next_pending == 0 must be visible in - * __rcu_process_callbacks() before it can see new value of cur. - */ - smp_wmb(); rcp->cur++; /* @@ -441,16 +437,14 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, /* determine batch number */ rdp->batch = rcp->cur + 1; - /* see the comment and corresponding wmb() in - * the rcu_start_batch() - */ - smp_rmb(); - if (!rcp->next_pending) { + if (rcu_batch_after(rdp->batch, rcp->pending)) { /* and start it/schedule start if it's a new batch */ spin_lock(&rcp->lock); - rcp->next_pending = 1; - rcu_start_batch(rcp); + if (rcu_batch_after(rdp->batch, rcp->pending)) { + rcp->pending = rdp->batch; + rcu_start_batch(rcp); + } spin_unlock(&rcp->lock); } } -- cgit v1.1 From 5127bed588a2f8f3a1f732de2a8a190b7df5dce3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 6 Jul 2008 17:23:59 +0800 Subject: rcu classic: new algorithm for callbacks-processing(v2) This is v2, it's a little deference from v1 that I had send to lkml. use ACCESS_ONCE use rcu_batch_after/rcu_batch_before for batch # comparison. rcutorture test result: (hotplugs: do cpu-online/offline once per second) No CONFIG_NO_HZ: OK, 12hours No CONFIG_NO_HZ, hotplugs: OK, 12hours CONFIG_NO_HZ=y: OK, 24hours CONFIG_NO_HZ=y, hotplugs: Failed. (Failed also without my patch applied, exactly the same bug occurred, http://lkml.org/lkml/2008/7/3/24) v1's email thread: http://lkml.org/lkml/2008/6/2/539 v1's description: The code/algorithm of the implement of current callbacks-processing is very efficient and technical. But when I studied it and I found a disadvantage: In multi-CPU systems, when a new RCU callback is being queued(call_rcu[_bh]), this callback will be invoked after the grace period for the batch with batch number = rcp->cur+2 has completed very very likely in current implement. Actually, this callback can be invoked after the grace period for the batch with batch number = rcp->cur+1 has completed. The delay of invocation means that latency of synchronize_rcu() is extended. But more important thing is that the callbacks usually free memory, and these works are delayed too! it's necessary for reclaimer to free memory as soon as possible when left memory is few. A very simple way can solve this problem: a field(struct rcu_head::batch) is added to record the batch number for the RCU callback. And when a new RCU callback is being queued, we determine the batch number for this callback(head->batch = rcp->cur+1) and we move this callback to rdp->donelist if we find that head->batch <= rcp->completed when we process callbacks. This simple way reduces the wait time for invocation a lot. (about 2.5Grace Period -> 1.5Grace Period in average in multi-CPU systems) This is my algorithm. But I do not add any field for struct rcu_head in my implement. We just need to memorize the last 2 batches and their batch number, because these 2 batches include all entries that for whom the grace period hasn't completed. So we use a special linked-list rather than add a field. Please see the comment of struct rcu_data. Signed-off-by: Lai Jiangshan Cc: "Paul E. McKenney" Cc: Dipankar Sarma Cc: Gautham Shenoy Cc: Dhaval Giani Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 157 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 97 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 03726eb..d3553ee 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -120,6 +120,43 @@ static inline void force_quiescent_state(struct rcu_data *rdp, } #endif +static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + long batch; + smp_mb(); /* reads the most recently updated value of rcu->cur. */ + + /* + * Determine the batch number of this callback. + * + * Using ACCESS_ONCE to avoid the following error when gcc eliminates + * local variable "batch" and emits codes like this: + * 1) rdp->batch = rcp->cur + 1 # gets old value + * ...... + * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value + * then [*nxttail[0], *nxttail[1]) may contain callbacks + * that batch# = rdp->batch, see the comment of struct rcu_data. + */ + batch = ACCESS_ONCE(rcp->cur) + 1; + + if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) { + /* process callbacks */ + rdp->nxttail[0] = rdp->nxttail[1]; + rdp->nxttail[1] = rdp->nxttail[2]; + if (rcu_batch_after(batch - 1, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[2]; + } + + rdp->batch = batch; + *rdp->nxttail[2] = head; + rdp->nxttail[2] = &head->next; + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } +} + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -135,18 +172,11 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } + __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu); @@ -171,20 +201,11 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_bh_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_bh_ctrlblk); - } - + __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -213,12 +234,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); static inline void raise_rcu_softirq(void) { raise_softirq(RCU_SOFTIRQ); - /* - * The smp_mb() here is required to ensure that this cpu's - * __rcu_process_callbacks() reads the most recently updated - * value of rcu->cur. - */ - smp_mb(); } /* @@ -360,13 +375,15 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, * which is dead and hence not processing interrupts. */ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail) + struct rcu_head **tail, long batch) { - local_irq_disable(); - *this_rdp->nxttail = list; - if (list) - this_rdp->nxttail = tail; - local_irq_enable(); + if (list) { + local_irq_disable(); + this_rdp->batch = batch; + *this_rdp->nxttail[2] = list; + this_rdp->nxttail[2] = tail; + local_irq_enable(); + } } static void __rcu_offline_cpu(struct rcu_data *this_rdp, @@ -380,9 +397,9 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp, if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); spin_unlock_bh(&rcp->lock); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); - rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + /* spin_lock implies smp_mb() */ + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); local_irq_disable(); this_rdp->qlen += rdp->qlen; @@ -416,27 +433,37 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { - *rdp->donetail = rdp->curlist; - rdp->donetail = rdp->curtail; - rdp->curlist = NULL; - rdp->curtail = &rdp->curlist; - } - - if (rdp->nxtlist && !rdp->curlist) { + if (rdp->nxtlist) { local_irq_disable(); - rdp->curlist = rdp->nxtlist; - rdp->curtail = rdp->nxttail; - rdp->nxtlist = NULL; - rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); /* - * start the next batch of callbacks + * move the other grace-period-completed entries to + * [rdp->nxtlist, *rdp->nxttail[0]) temporarily + */ + if (!rcu_batch_before(rcp->completed, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; + else if (!rcu_batch_before(rcp->completed, rdp->batch - 1)) + rdp->nxttail[0] = rdp->nxttail[1]; + + /* + * the grace period for entries in + * [rdp->nxtlist, *rdp->nxttail[0]) has completed and + * move these entries to donelist */ + if (rdp->nxttail[0] != &rdp->nxtlist) { + *rdp->donetail = rdp->nxtlist; + rdp->donetail = rdp->nxttail[0]; + rdp->nxtlist = *rdp->nxttail[0]; + *rdp->donetail = NULL; + + if (rdp->nxttail[1] == rdp->nxttail[0]) + rdp->nxttail[1] = &rdp->nxtlist; + if (rdp->nxttail[2] == rdp->nxttail[0]) + rdp->nxttail[2] = &rdp->nxtlist; + rdp->nxttail[0] = &rdp->nxtlist; + } - /* determine batch number */ - rdp->batch = rcp->cur + 1; + local_irq_enable(); if (rcu_batch_after(rdp->batch, rcp->pending)) { /* and start it/schedule start if it's a new batch */ @@ -462,15 +489,26 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) - return 1; + if (rdp->nxtlist) { + /* + * This cpu has pending rcu entries and the grace period + * for them has completed. + */ + if (!rcu_batch_before(rcp->completed, rdp->batch)) + return 1; + if (!rcu_batch_before(rcp->completed, rdp->batch - 1) && + rdp->nxttail[0] != rdp->nxttail[1]) + return 1; + if (rdp->nxttail[0] != &rdp->nxtlist) + return 1; - /* This cpu has no pending entries, but there are new entries */ - if (!rdp->curlist && rdp->nxtlist) - return 1; + /* + * This cpu has pending rcu entries and the new batch + * for then hasn't been started nor scheduled start + */ + if (rcu_batch_after(rdp->batch, rcp->pending)) + return 1; + } /* This cpu has finished callbacks to invoke */ if (rdp->donelist) @@ -506,7 +544,7 @@ int rcu_needs_cpu(int cpu) struct rcu_data *rdp = &per_cpu(rcu_data, cpu); struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); + return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); } void rcu_check_callbacks(int cpu, int user) @@ -553,8 +591,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { memset(rdp, 0, sizeof(*rdp)); - rdp->curtail = &rdp->curlist; - rdp->nxttail = &rdp->nxtlist; + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; -- cgit v1.1 From 67182ae1c42206e516f7efb292b745e826497b24 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 10 Aug 2008 18:35:38 -0700 Subject: rcu, debug: detect stalled grace periods this is a diagnostic patch for Classic RCU. The approach is to record a timestamp at the beginning of the grace period (in rcu_start_batch()), then have rcu_check_callbacks() complain if: 1. it is running on a CPU that has holding up grace periods for a long time (say one second). This will identify the culprit assuming that the culprit has not disabled hardware irqs, instruction execution, or some such. 2. it is running on a CPU that is not holding up grace periods, but grace periods have been held up for an even longer time (say two seconds). It is enabled via the default-off CONFIG_DEBUG_RCU_STALL kernel parameter. Rather than exponential backoff, it backs off to once per 30 seconds. My feeling upon thinking on it was that if you have stalled RCU grace periods for that long, a few extra printk() messages are probably the least of your worries... Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Yinghai Lu Cc: David Witbrodt Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index d427114..d7ec731 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -286,6 +287,81 @@ static void rcu_do_batch(struct rcu_data *rdp) * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). */ + +#ifdef CONFIG_DEBUG_RCU_STALL + +static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_check = get_seconds() + 3; +} +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock(&rcp->lock); + delta = get_seconds() - rcp->gp_check; + if (delta < 2L || + cpus_empty(rcp->cpumask)) { + spin_unlock(&rcp->lock); + return; + rcp->gp_check = get_seconds() + 30; + } + spin_unlock(&rcp->lock); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_cpu_mask(cpu, rcp->cpumask) + printk(" %d", cpu); + printk(" (detected by %d, t=%lu/%lu)\n", + smp_processor_id(), get_seconds(), rcp->gp_check); +} +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", + smp_processor_id(), get_seconds(), rcp->gp_check); + dump_stack(); + spin_lock(&rcp->lock); + if ((long)(get_seconds() - rcp->gp_check) >= 0L) + rcp->gp_check = get_seconds() + 30; + spin_unlock(&rcp->lock); +} +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + long delta; + + delta = get_seconds() - rcp->gp_check; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { + + /* We haven't checked in, so go dump stack. */ + + print_cpu_stall(rcp); + + } else if (!cpus_empty(rcp->cpumask) && delta >= 2L) { + + /* They had two seconds to dump stack, so complain. */ + + print_other_cpu_stall(rcp); + + } +} + +#else /* #ifdef CONFIG_DEBUG_RCU_STALL */ + +static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) +{ +} +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ + /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -296,6 +372,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { rcp->cur++; + record_gp_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -489,6 +566,9 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rcp, rdp); + if (rdp->nxtlist) { /* * This cpu has pending rcu entries and the grace period -- cgit v1.1 From 78635fc739b1254f3e0362ac430edbdd2cff01dc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 11 Aug 2008 13:34:15 +0200 Subject: rcu, debug: detect stalled grace periods, cleanups small cleanups. Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index d7ec731..56b8712 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -294,6 +294,7 @@ static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) { rcp->gp_check = get_seconds() + 3; } + static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) { int cpu; @@ -303,11 +304,9 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) spin_lock(&rcp->lock); delta = get_seconds() - rcp->gp_check; - if (delta < 2L || - cpus_empty(rcp->cpumask)) { + if (delta < 2L || cpus_empty(rcp->cpumask)) { spin_unlock(&rcp->lock); return; - rcp->gp_check = get_seconds() + 30; } spin_unlock(&rcp->lock); @@ -319,6 +318,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) printk(" (detected by %d, t=%lu/%lu)\n", smp_processor_id(), get_seconds(), rcp->gp_check); } + static void print_cpu_stall(struct rcu_ctrlblk *rcp) { printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", @@ -329,8 +329,8 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp) rcp->gp_check = get_seconds() + 30; spin_unlock(&rcp->lock); } -static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) + +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { long delta; @@ -341,12 +341,11 @@ static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, print_cpu_stall(rcp); - } else if (!cpus_empty(rcp->cpumask) && delta >= 2L) { - - /* They had two seconds to dump stack, so complain. */ - - print_other_cpu_stall(rcp); - + } else { + if (!cpus_empty(rcp->cpumask) && delta >= 2L) { + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } } } @@ -355,8 +354,9 @@ static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) { } -static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) + +static inline void +check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { } -- cgit v1.1 From b845b517b5e3706a3729f6ea83b88ab85f0725b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Aug 2008 21:47:09 +0200 Subject: printk: robustify printk Avoid deadlocks against rq->lock and xtime_lock by deferring the klogd wakeup by polling from the timer tick. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/printk.c | 19 +++++++++++++++++-- kernel/time/tick-sched.c | 2 +- kernel/timer.c | 1 + 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index b51b156..655cc2c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -982,10 +982,25 @@ int is_console_locked(void) return console_locked; } -void wake_up_klogd(void) +static DEFINE_PER_CPU(int, printk_pending); + +void printk_tick(void) { - if (!oops_in_progress && waitqueue_active(&log_wait)) + if (__get_cpu_var(printk_pending)) { + __get_cpu_var(printk_pending) = 0; wake_up_interruptible(&log_wait); + } +} + +int printk_needs_cpu(int cpu) +{ + return per_cpu(printk_pending, cpu); +} + +void wake_up_klogd(void) +{ + if (waitqueue_active(&log_wait)) + __get_cpu_var(printk_pending) = 1; } /** diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 825b4c0..c13d4f1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -255,7 +255,7 @@ void tick_nohz_stop_sched_tick(int inidle) next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; - if (rcu_needs_cpu(cpu)) + if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu)) delta_jiffies = 1; /* * Do not stop the tick, if we are only one off diff --git a/kernel/timer.c b/kernel/timer.c index 03bc7f1..510fe69 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -978,6 +978,7 @@ void update_process_times(int user_tick) run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); + printk_tick(); scheduler_tick(); run_posix_cpu_timers(p); } -- cgit v1.1 From cf417141cbb3a4ceb5cca15b2c1f099bd0a6603c Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 11 Aug 2008 14:33:53 -0700 Subject: sched, cpuset: rework sched domains and CPU hotplug handling (v4) This is an updated version of my previous cpuset patch on top of the latest mainline git. The patch fixes CPU hotplug handling issues in the current cpusets code. Namely circular locking in rebuild_sched_domains() and unsafe access to the cpu_online_map in the cpuset cpu hotplug handler. This version includes changes suggested by Paul Jackson (naming, comments, style, etc). I also got rid of the separate workqueue thread because it is now safe to call get_online_cpus() from workqueue callbacks. Here are some more details: rebuild_sched_domains() is the only way to rebuild sched domains correctly based on the current cpuset settings. What this means is that we need to be able to call it from different contexts, like cpu hotplug for example. Also latest scheduler code in -tip now calls rebuild_sched_domains() directly from functions like arch_reinit_sched_domains(). In order to support that properly we need to rework cpuset locking rules to avoid circular dependencies, which is what this patch does. New lock nesting rules are explained in the comments. We can now safely call rebuild_sched_domains() from virtually any context. The only requirement is that it needs to be called under get_online_cpus(). This allows cpu hotplug handlers and the scheduler to call rebuild_sched_domains() directly. The rest of the cpuset code now offloads sched domains rebuilds to a workqueue (async_rebuild_sched_domains()). This version of the patch addresses comments from the previous review. I fixed all miss-formated comments and trailing spaces. I also factored out the code that builds domain masks and split up CPU and memory hotplug handling. This was needed to simplify locking, to avoid unsafe access to the cpu_online_map from mem hotplug handler, and in general to make things cleaner. The patch passes moderate testing (building kernel with -j 16, creating & removing domains and bringing cpus off/online at the same time) on the quad-core2 based machine. It passes lockdep checks, even with preemptable RCU enabled. This time I also tested in with suspend/resume path and everything is working as expected. Signed-off-by: Max Krasnyansky Acked-by: Paul Jackson Cc: menage@google.com Cc: a.p.zijlstra@chello.nl Cc: vegard.nossum@gmail.com Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 312 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 182 insertions(+), 130 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d5ab79c..f227bc1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -14,6 +14,8 @@ * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * 2006 Rework by Paul Menage to use generic cgroups + * 2008 Rework of the scheduler domains and CPU hotplug handling + * by Max Krasnyansky * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux @@ -236,9 +238,11 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(callback_mutex); -/* This is ugly, but preserves the userspace API for existing cpuset +/* + * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we - * silently switch it to mount "cgroup" instead */ + * silently switch it to mount "cgroup" instead + */ static int cpuset_get_sb(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data, struct vfsmount *mnt) @@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) } /* - * Helper routine for rebuild_sched_domains(). + * Helper routine for generate_sched_domains(). * Do cpusets a, b have overlapping cpus_allowed masks? */ - static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { return cpus_intersects(a->cpus_allowed, b->cpus_allowed); @@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) } /* - * rebuild_sched_domains() - * - * This routine will be called to rebuild the scheduler's dynamic - * sched domains: - * - if the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, - * - or if the 'cpus' allowed changes in any cpuset which has that - * flag enabled, - * - or if the 'sched_relax_domain_level' of any cpuset which has - * that flag enabled and with non-empty 'cpus' changes, - * - or if any cpuset with non-empty 'cpus' is removed, - * - or if a cpu gets offlined. - * - * This routine builds a partial partition of the systems CPUs - * (the set of non-overlappping cpumask_t's in the array 'part' - * below), and passes that partial partition to the kernel/sched.c - * partition_sched_domains() routine, which will rebuild the - * schedulers load balancing domains (sched domains) as specified - * by that partial partition. A 'partial partition' is a set of - * non-overlapping subsets whose union is a subset of that set. + * generate_sched_domains() + * + * This function builds a partial partition of the systems CPUs + * A 'partial partition' is a set of non-overlapping subsets whose + * union is a subset of that set. + * The output of this function needs to be passed to kernel/sched.c + * partition_sched_domains() routine, which will rebuild the scheduler's + * load balancing domains (sched domains) as specified by that partial + * partition. * * See "What is sched_load_balance" in Documentation/cpusets.txt * for a background explanation of this. @@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Call with cgroup_mutex held. May take callback_mutex during - * call due to the kfifo_alloc() and kmalloc() calls. May nest - * a call to the get_online_cpus()/put_online_cpus() pair. - * Must not be called holding callback_mutex, because we must not - * call get_online_cpus() while holding callback_mutex. Elsewhere - * the kernel nests callback_mutex inside get_online_cpus() calls. - * So the reverse nesting would risk an ABBA deadlock. + * Must be called with cgroup_lock held. * * The three key local variables below are: * q - a linked-list queue of cpuset pointers, used to implement a @@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ - -void rebuild_sched_domains(void) +static int generate_sched_domains(cpumask_t **domains, + struct sched_domain_attr **attributes) { - LIST_HEAD(q); /* queue of cpusets to be scanned*/ + LIST_HEAD(q); /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ @@ -601,23 +587,26 @@ void rebuild_sched_domains(void) int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ - csa = NULL; + ndoms = 0; doms = NULL; dattr = NULL; + csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - ndoms = 1; doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms) - goto rebuild; + goto done; + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); if (dattr) { *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } *doms = top_cpuset.cpus_allowed; - goto rebuild; + + ndoms = 1; + goto done; } csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); @@ -680,61 +669,141 @@ restart: } } - /* Convert to */ + /* + * Now we know how many domains to create. + * Convert to and populate cpu masks. + */ doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); - if (!doms) - goto rebuild; + if (!doms) { + ndoms = 0; + goto done; + } + + /* + * The rest of the code, including the scheduler, can deal with + * dattr==NULL case. No need to abort if alloc fails. + */ dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; + cpumask_t *dp; int apn = a->pn; - if (apn >= 0) { - cpumask_t *dp = doms + nslot; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - printk(KERN_WARNING - "rebuild_sched_domains confused:" - " nslot %d, ndoms %d, csn %d, i %d," - " apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; + if (apn < 0) { + /* Skip completed partitions */ + continue; + } + + dp = doms + nslot; + + if (nslot == ndoms) { + static int warnings = 10; + if (warnings) { + printk(KERN_WARNING + "rebuild_sched_domains confused:" + " nslot %d, ndoms %d, csn %d, i %d," + " apn %d\n", + nslot, ndoms, csn, i, apn); + warnings--; } + continue; + } - cpus_clear(*dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; - for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpus_or(*dp, *dp, b->cpus_allowed); - b->pn = -1; - if (dattr) - update_domain_attr_tree(dattr - + nslot, b); - } + cpus_clear(*dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + for (j = i; j < csn; j++) { + struct cpuset *b = csa[j]; + + if (apn == b->pn) { + cpus_or(*dp, *dp, b->cpus_allowed); + if (dattr) + update_domain_attr_tree(dattr + nslot, b); + + /* Done with this partition */ + b->pn = -1; } - nslot++; } + nslot++; } BUG_ON(nslot != ndoms); -rebuild: - /* Have scheduler rebuild sched domains */ +done: + kfree(csa); + + *domains = doms; + *attributes = dattr; + return ndoms; +} + +/* + * Rebuild scheduler domains. + * + * Call with neither cgroup_mutex held nor within get_online_cpus(). + * Takes both cgroup_mutex and get_online_cpus(). + * + * Cannot be directly called from cpuset code handling changes + * to the cpuset pseudo-filesystem, because it cannot be called + * from code that already holds cgroup_mutex. + */ +static void do_rebuild_sched_domains(struct work_struct *unused) +{ + struct sched_domain_attr *attr; + cpumask_t *doms; + int ndoms; + get_online_cpus(); - partition_sched_domains(ndoms, doms, dattr); + + /* Generate domain masks and attrs */ + cgroup_lock(); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + /* Have scheduler rebuild the domains */ + partition_sched_domains(ndoms, doms, attr); + put_online_cpus(); +} -done: - kfree(csa); - /* Don't kfree(doms) -- partition_sched_domains() does that. */ - /* Don't kfree(dattr) -- partition_sched_domains() does that. */ +static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); + +/* + * Rebuild scheduler domains, asynchronously via workqueue. + * + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. + * + * The rebuild_sched_domains() and partition_sched_domains() + * routines must nest cgroup_lock() inside get_online_cpus(), + * but such cpuset changes as these must nest that locking the + * other way, holding cgroup_lock() for much of the code. + * + * So in order to avoid an ABBA deadlock, the cpuset code handling + * these user changes delegates the actual sched domain rebuilding + * to a separate workqueue thread, which ends up processing the + * above do_rebuild_sched_domains() function. + */ +static void async_rebuild_sched_domains(void) +{ + schedule_work(&rebuild_sched_domains_work); +} + +/* + * Accomplishes the same scheduler domain rebuild as the above + * async_rebuild_sched_domains(), however it directly calls the + * rebuild routine synchronously rather than calling it via an + * asynchronous work thread. + * + * This can only be called from code that is not holding + * cgroup_mutex (not nested in a cgroup_lock() call.) + */ +void rebuild_sched_domains(void) +{ + do_rebuild_sched_domains(NULL); } /** @@ -863,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf) return retval; if (is_load_balanced) - rebuild_sched_domains(); + async_rebuild_sched_domains(); return 0; } @@ -1090,7 +1159,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) if (val != cs->relax_domain_level) { cs->relax_domain_level = val; if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains(); + async_rebuild_sched_domains(); } return 0; @@ -1131,7 +1200,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, mutex_unlock(&callback_mutex); if (cpus_nonempty && balance_flag_changed) - rebuild_sched_domains(); + async_rebuild_sched_domains(); return 0; } @@ -1492,6 +1561,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) default: BUG(); } + + /* Unreachable but makes gcc happy */ + return 0; } static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) @@ -1504,6 +1576,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) default: BUG(); } + + /* Unrechable but makes gcc happy */ + return 0; } @@ -1692,15 +1767,9 @@ static struct cgroup_subsys_state *cpuset_create( } /* - * Locking note on the strange update_flag() call below: - * * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains(). The get_online_cpus() - * call in rebuild_sched_domains() must not be made while holding - * callback_mutex. Elsewhere the kernel nests callback_mutex inside - * get_online_cpus() calls. So the reverse nesting would risk an - * ABBA deadlock. + * will call async_rebuild_sched_domains(). */ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) @@ -1719,7 +1788,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) struct cgroup_subsys cpuset_subsys = { .name = "cpuset", .create = cpuset_create, - .destroy = cpuset_destroy, + .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, .attach = cpuset_attach, .populate = cpuset_populate, @@ -1811,7 +1880,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) } /* - * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs + * If CPU and/or memory hotplug handlers, below, unplug any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the * last CPU or node from a cpuset, then move the tasks in the empty @@ -1903,35 +1972,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root) } /* - * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track - * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to - * track what's online after any CPU or memory node hotplug or unplug event. - * - * Since there are two callers of this routine, one for CPU hotplug - * events and one for memory node hotplug events, we could have coded - * two separate routines here. We code it as a single common routine - * in order to minimize text size. - */ - -static void common_cpu_mem_hotplug_unplug(int rebuild_sd) -{ - cgroup_lock(); - - top_cpuset.cpus_allowed = cpu_online_map; - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - scan_for_empty_cpusets(&top_cpuset); - - /* - * Scheduler destroys domains on hotplug events. - * Rebuild them based on the current settings. - */ - if (rebuild_sd) - rebuild_sched_domains(); - - cgroup_unlock(); -} - -/* * The top_cpuset tracks what CPUs and Memory Nodes are online, * period. This is necessary in order to make cpusets transparent * (of no affect) on systems that are actively using CPU hotplug @@ -1939,40 +1979,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_online_map on each CPU hotplug (cpuhp) event. + * + * Called within get_online_cpus(). Needs to call cgroup_lock() + * before calling generate_sched_domains(). */ - -static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, +static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { + struct sched_domain_attr *attr; + cpumask_t *doms; + int ndoms; + switch (phase) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - common_cpu_mem_hotplug_unplug(1); break; + default: return NOTIFY_DONE; } + cgroup_lock(); + top_cpuset.cpus_allowed = cpu_online_map; + scan_for_empty_cpusets(&top_cpuset); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + /* Have scheduler rebuild the domains */ + partition_sched_domains(ndoms, doms, attr); + return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. - * Call this routine anytime after you change - * node_states[N_HIGH_MEMORY]. - * See also the previous routine cpuset_handle_cpuhp(). + * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. + * See also the previous routine cpuset_track_online_cpus(). */ - void cpuset_track_online_nodes(void) { - common_cpu_mem_hotplug_unplug(0); + cgroup_lock(); + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + scan_for_empty_cpusets(&top_cpuset); + cgroup_unlock(); } #endif @@ -1987,7 +2039,7 @@ void __init cpuset_init_smp(void) top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - hotcpu_notifier(cpuset_handle_cpuhp, 0); + hotcpu_notifier(cpuset_track_online_cpus, 0); } /** -- cgit v1.1 From 293a17ebc944c958e24e6ffbd1d5a49abdbf489e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Aug 2008 17:25:03 -0700 Subject: rcu: prevent console flood when one CPU sees another AWOL via RCU One small change needed to keep from flooding the console when one CPU notices that another is AWOL. Unless I am missing something subtle. Otherwise the cleanups look good! Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 56b8712..dab2676 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -308,6 +308,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) spin_unlock(&rcp->lock); return; } + rcp->gp_check = get_seconds() + 30; spin_unlock(&rcp->lock); /* OK, time to rat on our buddy... */ -- cgit v1.1 From 1f7b94cd3d564901f9e04a8bc5832ae7bfd690a0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Aug 2008 09:21:44 -0700 Subject: rcu: classic RCU locking and memory-barrier cleanups This patch simplifies the locking and memory-barrier usage in the Classic RCU grace-period-detection mechanism, incorporating Lai Jiangshan's feedback from the earlier version (http://lkml.org/lkml/2008/8/1/400 and http://lkml.org/lkml/2008/8/3/43). Passed 10 hours of rcutorture concurrent with CPUs being put online and taken offline on a 128-hardware-thread Power machine. My apologies to whoever in the Eastern Hemisphere was planning to use this machine over the Western Hemisphere night, but it was sitting idle and... So this is ready for tip/core/rcu. This patch is in preparation for moving to a hierarchical algorithm to allow the very large SMP machines -- requested by some people at OLS, and there seem to have been a few recent patches in the 4096-CPU direction as well. The general idea is to move to a much more conservative concurrency design, then apply a hierarchy to reduce contention on the global lock by a few orders of magnitude (larger machines would see greater reductions). The reason for taking a conservative approach is that this code isn't on any fast path. Prototype in progress. This patch is against the linux-tip git tree (tip/core/rcu). If you wish to test this against 2.6.26, use the following set of patches: http://www.rdrop.com/users/paulmck/patches/2.6.26-ljsimp-1.patch http://www.rdrop.com/users/paulmck/patches/2.6.26-ljsimpfix-3.patch The first patch combines commits 5127bed588a2f8f3a1f732de2a8a190b7df5dce3 and 3cac97cbb14aed00d83eb33d4613b0fe3aaea863 from Lai Jiangshan , and the second patch contains my changes. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 51 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index dab2676..5de1266 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -87,6 +87,7 @@ static void force_quiescent_state(struct rcu_data *rdp, int cpu; cpumask_t cpumask; set_need_resched(); + spin_lock(&rcp->lock); if (unlikely(!rcp->signaled)) { rcp->signaled = 1; /* @@ -112,6 +113,7 @@ static void force_quiescent_state(struct rcu_data *rdp, for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } + spin_unlock(&rcp->lock); } #else static inline void force_quiescent_state(struct rcu_data *rdp, @@ -125,7 +127,9 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { long batch; - smp_mb(); /* reads the most recently updated value of rcu->cur. */ + + head->next = NULL; + smp_mb(); /* Read of rcu->cur must happen after any change by caller. */ /* * Determine the batch number of this callback. @@ -175,7 +179,6 @@ void call_rcu(struct rcu_head *head, unsigned long flags; head->func = func; - head->next = NULL; local_irq_save(flags); __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); local_irq_restore(flags); @@ -204,7 +207,6 @@ void call_rcu_bh(struct rcu_head *head, unsigned long flags; head->func = func; - head->next = NULL; local_irq_save(flags); __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); local_irq_restore(flags); @@ -467,17 +469,17 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, static void __rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* if the cpu going offline owns the grace period + /* + * if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ spin_lock_bh(&rcp->lock); if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); - spin_unlock_bh(&rcp->lock); - /* spin_lock implies smp_mb() */ rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); + spin_unlock_bh(&rcp->lock); local_irq_disable(); this_rdp->qlen += rdp->qlen; @@ -511,16 +513,19 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + long completed_snap; + if (rdp->nxtlist) { local_irq_disable(); + completed_snap = ACCESS_ONCE(rcp->completed); /* * move the other grace-period-completed entries to * [rdp->nxtlist, *rdp->nxttail[0]) temporarily */ - if (!rcu_batch_before(rcp->completed, rdp->batch)) + if (!rcu_batch_before(completed_snap, rdp->batch)) rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; - else if (!rcu_batch_before(rcp->completed, rdp->batch - 1)) + else if (!rcu_batch_before(completed_snap, rdp->batch - 1)) rdp->nxttail[0] = rdp->nxttail[1]; /* @@ -561,8 +566,24 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, static void rcu_process_callbacks(struct softirq_action *unused) { + /* + * Memory references from any prior RCU read-side critical sections + * executed by the interrupted code must be see before any RCU + * grace-period manupulations below. + */ + + smp_mb(); /* See above block comment. */ + __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); + + /* + * Memory references from any later RCU read-side critical sections + * executed by the interrupted code must be see after any RCU + * grace-period manupulations above. + */ + + smp_mb(); /* See above block comment. */ } static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) @@ -571,13 +592,15 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) check_cpu_stall(rcp, rdp); if (rdp->nxtlist) { + long completed_snap = ACCESS_ONCE(rcp->completed); + /* * This cpu has pending rcu entries and the grace period * for them has completed. */ - if (!rcu_batch_before(rcp->completed, rdp->batch)) + if (!rcu_batch_before(completed_snap, rdp->batch)) return 1; - if (!rcu_batch_before(rcp->completed, rdp->batch - 1) && + if (!rcu_batch_before(completed_snap, rdp->batch - 1) && rdp->nxttail[0] != rdp->nxttail[1]) return 1; if (rdp->nxttail[0] != &rdp->nxtlist) @@ -628,6 +651,12 @@ int rcu_needs_cpu(int cpu) return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); } +/* + * Top-level function driving RCU grace-period detection, normally + * invoked from the scheduler-clock interrupt. This function simply + * increments counters that are read only from softirq by this same + * CPU, so there are no memory barriers required. + */ void rcu_check_callbacks(int cpu, int user) { if (user || @@ -671,6 +700,7 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + spin_lock(&rcp->lock); memset(rdp, 0, sizeof(*rdp)); rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; @@ -678,6 +708,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; + spin_unlock(&rcp->lock); } static void __cpuinit rcu_online_cpu(int cpu) -- cgit v1.1 From 5802294f1b1895ee19a3d0ae72805da453afb9de Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 Jul 2008 14:20:55 -0400 Subject: rcu: trace fix possible mem-leak In the initialization of the RCU trace module, if rcupreempt_debugfs_init() fails, we never free the the trace buffer. This patch frees the trace buffer in case the debugfs fails. Signed-off-by: Steven Rostedt Reviewed-by: "Paul E. McKenney" Signed-off-by: Ingo Molnar --- kernel/rcupreempt_trace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 5edf82c..35c2d33 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -308,11 +308,16 @@ out: static int __init rcupreempt_trace_init(void) { + int ret; + mutex_init(&rcupreempt_trace_mutex); rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); if (!rcupreempt_trace_buf) return 1; - return rcupreempt_debugfs_init(); + ret = rcupreempt_debugfs_init(); + if (ret) + kfree(rcupreempt_trace_buf); + return ret; } static void __exit rcupreempt_trace_cleanup(void) -- cgit v1.1 From cd95851785bcfe95fdf73689e8ecb5a1c5959231 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Aug 2008 07:37:15 -0700 Subject: rcu: fix classic RCU locking cleanup lockdep problem On Fri, Aug 15, 2008 at 04:24:30PM +0200, Ingo Molnar wrote: > > Paul, > > one of your two recent RCU patches caused this lockdep splat in -tip > testing: > > -------------------> > Brought up 2 CPUs > Total of 2 processors activated (6850.87 BogoMIPS). > PM: Adding info for No Bus:platform > khelper used greatest stack depth: 3124 bytes left > > ================================= > [ INFO: inconsistent lock state ] > 2.6.27-rc3-tip #1 > --------------------------------- > inconsistent {softirq-on-W} -> {in-softirq-W} usage. > ksoftirqd/0/4 [HC0[0]:SC1[1]:HE1:SE0] takes: > (&rcu_ctrlblk.lock){-+..}, at: [] __rcu_process_callbacks+0x1ac/0x1f0 > {softirq-on-W} state was registered at: > [] __lock_acquire+0x3f4/0x5b0 > [] lock_acquire+0x89/0xc0 > [] _spin_lock+0x3b/0x70 > [] rcu_init_percpu_data+0x29/0x80 > [] rcu_cpu_notify+0xaf/0xd0 > [] notifier_call_chain+0x2d/0x60 > [] __raw_notifier_call_chain+0x1e/0x30 > [] _cpu_up+0x79/0x110 > [] cpu_up+0x4d/0x70 > [] kernel_init+0xb1/0x200 > [] kernel_thread_helper+0x7/0x10 > [] 0xffffffff > irq event stamp: 14 > hardirqs last enabled at (14): [] trace_hardirqs_on+0xb/0x10 > hardirqs last disabled at (13): [] trace_hardirqs_off+0xb/0x10 > softirqs last enabled at (0): [] copy_process+0x276/0x1190 > softirqs last disabled at (11): [] call_on_stack+0x1a/0x30 > > other info that might help us debug this: > no locks held by ksoftirqd/0/4. > > stack backtrace: > Pid: 4, comm: ksoftirqd/0 Not tainted 2.6.27-rc3-tip #1 > [] print_usage_bug+0x16c/0x1b0 > [] mark_lock+0xa75/0xb10 > [] ? sched_clock+0x15/0x30 > [] __lock_acquire+0x3ad/0x5b0 > [] lock_acquire+0x89/0xc0 > [] ? __rcu_process_callbacks+0x1ac/0x1f0 > [] _spin_lock+0x3b/0x70 > [] ? __rcu_process_callbacks+0x1ac/0x1f0 > [] __rcu_process_callbacks+0x1ac/0x1f0 > [] rcu_process_callbacks+0x26/0x50 > [] __do_softirq+0x95/0x120 > [] ? __do_softirq+0x0/0x120 > [] call_on_stack+0x1a/0x30 > [] ? ksoftirqd+0x96/0x110 > [] ? ksoftirqd+0x0/0x110 > [] ? kthread+0x47/0x80 > [] ? kthread+0x0/0x80 > [] ? kernel_thread_helper+0x7/0x10 > ======================= > calling init_cpufreq_transition_notifier_list+0x0/0x20 > initcall init_cpufreq_transition_notifier_list+0x0/0x20 returned 0 after 0 msecs > calling net_ns_init+0x0/0x190 > net_namespace: 676 bytes > initcall net_ns_init+0x0/0x190 returned 0 after 0 msecs > calling cpufreq_tsc+0x0/0x20 > initcall cpufreq_tsc+0x0/0x20 returned 0 after 0 msecs > calling reboot_init+0x0/0x20 > initcall reboot_init+0x0/0x20 returned 0 after 0 msecs > calling print_banner+0x0/0x10 > Booting paravirtualized kernel on bare hardware > > <----------------------- > > my guess is on: > > commit 1f7b94cd3d564901f9e04a8bc5832ae7bfd690a0 > Author: Paul E. McKenney > Date: Tue Aug 5 09:21:44 2008 -0700 > > rcu: classic RCU locking and memory-barrier cleanups > > Ingo Fixes a problem detected by lockdep in which rcu->lock was acquired both in irq context and in process context, but without disabling from process context. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 5de1266..fb1f1cc4 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -700,7 +700,9 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - spin_lock(&rcp->lock); + long flags; + + spin_lock_irqsave(&rcp->lock, flags); memset(rdp, 0, sizeof(*rdp)); rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; @@ -708,7 +710,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } static void __cpuinit rcu_online_cpu(int cpu) -- cgit v1.1 From ded00a56e99555c3f4000ef3eebfd5fe0d574565 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Aug 2008 12:50:36 -0700 Subject: rcu: remove redundant ACCESS_ONCE definition from rcupreempt.c Remove the redundant definition of ACCESS_ONCE() from rcupreempt.c in favor of the one in compiler.h. Also merge the comment header from rcupreempt.c's definition into that in compiler.h. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 2782793..ca4bbbe 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -59,14 +59,6 @@ #include /* - * Macro that prevents the compiler from reordering accesses, but does - * absolutely -nothing- to prevent CPUs from reordering. This is used - * only to mediate communication between mainline code and hardware - * interrupt and NMI handlers. - */ -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) - -/* * PREEMPT_RCU data structures. */ -- cgit v1.1 From eff9b713ee3540ddab862095aaf4b1511a6758bc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Aug 2008 17:51:08 -0700 Subject: rcu: fix locking cleanup fallout Given that the rcp->lock is now acquired from call_rcu(), which can be invoked from irq-disable regions, all acquisitions need to disable irqs. The following patch fixes this. Although I don't have any reason to believe that this is the cause of Yinghai's oops, it does need to be fixed. Signed-off-by: Paul E. McKenney Cc: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index fb1f1cc4..c6b6cf5 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -86,8 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp, { int cpu; cpumask_t cpumask; + unsigned long flags; + set_need_resched(); - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (unlikely(!rcp->signaled)) { rcp->signaled = 1; /* @@ -113,7 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp, for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } #else static inline void force_quiescent_state(struct rcu_data *rdp, @@ -301,17 +303,18 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) { int cpu; long delta; + unsigned long flags; /* Only let one CPU complain about others per time interval. */ - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); delta = get_seconds() - rcp->gp_check; if (delta < 2L || cpus_empty(rcp->cpumask)) { spin_unlock(&rcp->lock); return; } rcp->gp_check = get_seconds() + 30; - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); /* OK, time to rat on our buddy... */ @@ -324,13 +327,15 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) static void print_cpu_stall(struct rcu_ctrlblk *rcp) { + unsigned long flags; + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", smp_processor_id(), get_seconds(), rcp->gp_check); dump_stack(); - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if ((long)(get_seconds() - rcp->gp_check) >= 0L) rcp->gp_check = get_seconds() + 30; - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) @@ -413,6 +418,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + if (rdp->quiescbatch != rcp->cur) { /* start new grace period: */ rdp->qs_pending = 1; @@ -436,7 +443,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, return; rdp->qs_pending = 0; - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); /* * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync * during cpu startup. Ignore the quiescent state. @@ -444,7 +451,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, if (likely(rdp->quiescbatch == rcp->cur)) cpu_quiet(rdp->cpu, rcp); - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } @@ -469,21 +476,22 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, static void __rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + /* * if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_bh(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); - spin_unlock_bh(&rcp->lock); + spin_unlock(&rcp->lock); - local_irq_disable(); this_rdp->qlen += rdp->qlen; - local_irq_enable(); + local_irq_restore(flags); } static void rcu_offline_cpu(int cpu) @@ -550,12 +558,12 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, if (rcu_batch_after(rdp->batch, rcp->pending)) { /* and start it/schedule start if it's a new batch */ - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (rcu_batch_after(rdp->batch, rcp->pending)) { rcp->pending = rdp->batch; rcu_start_batch(rcp); } - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } } -- cgit v1.1 From 0c925d79234fe77589d8ff3861f9f8bb9e7fc3f6 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 18 Aug 2008 21:49:51 -0700 Subject: rcuclassic: fix compilation NG fix: CC kernel/rcuclassic.o kernel/rcuclassic.c: In function '__rcu_process_callbacks': kernel/rcuclassic.c:561: error: 'flags' undeclared (first use in this function) kernel/rcuclassic.c:561: error: (Each undeclared identifier is reported only once kernel/rcuclassic.c:561: error: for each function it appears in.) Declare missing variable flags. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index c6b6cf5..01e761a 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -557,6 +557,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, local_irq_enable(); if (rcu_batch_after(rdp->batch, rcp->pending)) { + unsigned long flags; + /* and start it/schedule start if it's a new batch */ spin_lock_irqsave(&rcp->lock, flags); if (rcu_batch_after(rdp->batch, rcp->pending)) { -- cgit v1.1 From af4491e51632d01fbc2b856ffa9ebcd4b38db68c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:02 +0200 Subject: sched: rt-bandwidth for user grouping interface rt_runtime is a signed value Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 865ecf57..39d6159 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj, { struct user_struct *up = container_of(kobj, struct user_struct, kobj); - return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); + return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); } static ssize_t cpu_rt_runtime_store(struct kobject *kobj, @@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, unsigned long rt_runtime; int rc; - sscanf(buf, "%lu", &rt_runtime); + sscanf(buf, "%ld", &rt_runtime); rc = sched_group_set_rt_runtime(up->tg, rt_runtime); -- cgit v1.1 From 6f0d5c390e4206dcb3804a5072a048fdb7d2b428 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:03 +0200 Subject: sched: rt-bandwidth accounting fix It fixes an accounting bug where we would continue accumulating runtime even though the bandwidth control is disabled. This would lead to very long throttle periods once bandwidth control gets turned on again. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 998ba54b..77340b0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -438,9 +438,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); @@ -491,9 +488,11 @@ static void update_curr_rt(struct rq *rq) rt_rq = rt_rq_of_se(rt_se); spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + } spin_unlock(&rt_rq->rt_runtime_lock); } } -- cgit v1.1 From 0b148fa04852859972abbf848177b92daeef138a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:04 +0200 Subject: sched: rt-bandwidth group disable fixes More extensive disable of bandwidth control. It allows sysctl_sched_rt_runtime to disable full group bandwidth control. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++++++- kernel/sched_rt.c | 5 ++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9a1ddb8..c1bee5f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -204,11 +204,13 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } +static inline int rt_bandwidth_enabled(void); + static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_b->rt_runtime == RUNTIME_INF) + if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -839,6 +841,11 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 77340b0..94daace 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -386,7 +386,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int i, idle = 1; cpumask_t span; - if (rt_b->rt_runtime == RUNTIME_INF) + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); @@ -484,6 +484,9 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); + if (!rt_bandwidth_enabled()) + return; + for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); -- cgit v1.1 From eb755805f21bd5ded84026e167b7a90887ac42e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:05 +0200 Subject: sched: extract walk_tg_tree() Extract walk_tg_tree() and make it a little more generic so we can use it in the schedulablity test. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 79 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c1bee5f..8c019a1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1387,38 +1387,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) +typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. */ -static void -walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; + int ret; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, cpu, sd); + ret = (*down)(parent, data); + if (ret) + goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1426,14 +1412,42 @@ down: up: continue; } - (*up)(parent, cpu, sd); + ret = (*up)(parent, data); + if (ret) + goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; +out_unlock: rcu_read_unlock(); + + return ret; +} + +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; } +#endif + +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1493,11 +1507,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -static void -tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; + struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@ -1522,6 +1536,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } + + return 0; } /* @@ -1529,10 +1545,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ -static void -tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; + long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@ -1543,11 +1559,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) } tg->cfs_rq[cpu]->h_load = load; -} -static void -tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ + return 0; } static void update_shares(struct sched_domain *sd) @@ -1557,7 +1570,7 @@ static void update_shares(struct sched_domain *sd) if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@ -1568,9 +1581,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) spin_lock(&rq->lock); } -static void update_h_load(int cpu) +static void update_h_load(long cpu) { - walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else -- cgit v1.1 From 9a7e0b180da21885988d47558671cf580279f9d6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:06 +0200 Subject: sched: rt-bandwidth fixes The last patch allows sysctl_sched_rt_runtime to disable bandwidth accounting for the group scheduler - however it doesn't deal with sched_setscheduler(), which will keep tasks out of groups that have no assigned runtime. If we relax this, we get into the situation where RT tasks can get into a group when we disable bandwidth control, and then starve them by enabling it again. Rework the schedulability code to check for this condition and fail to turn on bandwidth control with -EBUSY when this situation is found. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 125 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 63 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8c019a1..e41bdae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -300,9 +300,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED) typedef int (*tg_visitor)(struct task_group *, void *); /* @@ -5082,7 +5082,8 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -8707,73 +8708,77 @@ static DEFINE_MUTEX(rt_constraints_mutex); static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 16; + return 1ULL << 20; - return div64_u64(runtime << 16, period); + return div64_u64(runtime << 20, period); } -#ifdef CONFIG_CGROUP_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) { - struct task_group *tgi, *parent = tg->parent; - unsigned long total = 0; + struct task_struct *g, *p; - if (!parent) { - if (global_rt_period() < period) - return 0; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); - return to_ratio(period, runtime) < - to_ratio(global_rt_period(), global_rt_runtime()); - } + return 0; +} - if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) - return 0; +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &parent->children, siblings) { - if (tgi == tg) - continue; +static int tg_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; } - rcu_read_unlock(); - return total + to_ratio(period, runtime) <= - to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), - parent->rt_bandwidth.rt_runtime); -} -#elif defined CONFIG_USER_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - struct task_group *tgi; - unsigned long total = 0; - unsigned long global_ratio = - to_ratio(global_rt_period(), global_rt_runtime()); + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) { - if (tgi == tg) - continue; + total = to_ratio(period, runtime); - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); } - rcu_read_unlock(); - return total + to_ratio(period, runtime) < global_ratio; + if (sum > total) + return -EINVAL; + + return 0; } -#endif -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) - return 1; - } while_each_thread(g, p); - return 0; + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + return walk_tg_tree(tg_schedulable, tg_nop, &data); } static int tg_set_bandwidth(struct task_group *tg, @@ -8783,14 +8788,9 @@ static int tg_set_bandwidth(struct task_group *tg, mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { - err = -EBUSY; + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) goto unlock; - } - if (!__rt_schedulable(tg, rt_period, rt_runtime)) { - err = -EINVAL; - goto unlock; - } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); @@ -8867,8 +8867,9 @@ static int sched_rt_global_constraints(void) rt_runtime = tg->rt_bandwidth.rt_runtime; mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(tg, rt_period, rt_runtime)) - ret = -EINVAL; + read_lock(&tasklist_lock); + ret = __rt_schedulable(tg, rt_period, rt_runtime); + read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; -- cgit v1.1 From fa33507a22623b3bd543b15a21c362cf364b6cff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2008 09:31:26 +0200 Subject: printk: robustify printk, fix #2 Dmitry Adamushko reported: > [*] btw., with DEBUG being enabled, pr_debug() generates [1] when > debug_smp_processor_id() is used (CONFIG_DEBUG_PREEMPT). > > the problem seems to be caused by the following commit: > commit b845b517b5e3706a3729f6ea83b88ab85f0725b0 > Author: Peter Zijlstra > Date: Fri Aug 8 21:47:09 2008 +0200 > > printk: robustify printk > > > wake_up_klogd() -> __get_cpu_var() -> smp_processor_id() > > and that's being called from release_console_sem() which is, in turn, > said to be "may be called from any context" [2] > > and in this case, it seems to be called from some non-preemptible > context (although, it can't be printk()... > although, I haven't looked carefully yet). > > Provided [2], __get_cpu_var() is perhaps not the right solution there. > > > [1] > > [ 7697.942005] BUG: using smp_processor_id() in preemptible [00000000] code: syslogd/3542 > [ 7697.942005] caller is wake_up_klogd+0x1b/0x50 > [ 7697.942005] Pid: 3542, comm: syslogd Not tainted 2.6.27-rc3-tip-git #2 > [ 7697.942005] Call Trace: > [ 7697.942005] [] debug_smp_processor_id+0xe8/0xf0 > [ 7697.942005] [] wake_up_klogd+0x1b/0x50 > [ 7697.942005] [] release_console_sem+0x1e7/0x200 > [ 7697.942005] [] do_con_write+0xb7/0x1f30 > [ 7697.942005] [] ? show_trace+0x10/0x20 > [ 7697.942005] [] ? dump_stack+0x72/0x80 > [ 7697.942005] [] ? __ratelimit+0xbd/0xe0 > [ 7697.942005] [] ? debug_smp_processor_id+0xe8/0xf0 > [ 7697.942005] [] ? wake_up_klogd+0x1b/0x50 > [ 7697.942005] [] ? release_console_sem+0x1e7/0x200 > [ 7697.942005] [] con_write+0x19/0x30 > [ 7697.942005] [] write_chan+0x276/0x3c0 > [ 7697.942005] [] ? default_wake_function+0x0/0x10 > [ 7697.942005] [] ? _spin_lock_irqsave+0x22/0x50 > [ 7697.942005] [] tty_write+0x194/0x260 > [ 7697.942005] [] ? write_chan+0x0/0x3c0 > [ 7697.942005] [] redirected_tty_write+0xa4/0xb0 > [ 7697.942005] [] ? redirected_tty_write+0x0/0xb0 > [ 7697.942005] [] do_loop_readv_writev+0x52/0x80 > [ 7697.942005] [] do_readv_writev+0x1bd/0x1d0 > [ 7697.942005] [] vfs_writev+0x39/0x60 > [ 7697.942005] [] sys_writev+0x50/0x90 > [ 7697.942005] [] system_call_fastpath+0x16/0x1b Signed-off-by: Peter Zijlstra Reported-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 655cc2c..57e9cd7 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1000,7 +1000,7 @@ int printk_needs_cpu(int cpu) void wake_up_klogd(void) { if (waitqueue_active(&log_wait)) - __get_cpu_var(printk_pending) = 1; + __raw_get_cpu_var(printk_pending) = 1; } /** -- cgit v1.1 From 1fa63a817d27af7dc0d5ed454eb8fe5dec65fac7 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 20 Aug 2008 14:40:55 +0200 Subject: printk: robustify printk, update comment Remove the comment describing the possibility of printk() deadlocking on runqueue lock. Signed-off-by: Jiri Kosina Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/printk.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 57e9cd7..6f27c6a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -577,9 +577,6 @@ static int have_callable_console(void) * @fmt: format string * * This is printk(). It can be called from any context. We want it to work. - * Be aware of the fact that if oops_in_progress is not set, we might try to - * wake klogd up which could deadlock on runqueue lock if printk() is called - * from scheduler code. * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output -- cgit v1.1 From efc2dead2c82cae31943828f6d977c483942b0eb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2008 12:44:55 +0200 Subject: sched: enable LB_BIAS by default Yanmin reported a significant regression on his 16-core machine due to: commit 93b75217df39e6d75889cc6f8050343286aff4a5 Author: Peter Zijlstra Date: Fri Jun 27 13:41:33 2008 +0200 Flip back to the old behaviour. Reported-by: "Zhang, Yanmin" Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 862b06b..9353ca7 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -8,6 +8,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) -SCHED_FEAT(LB_BIAS, 0) +SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) -- cgit v1.1 From 01dcb0443ed89eccf26c2b43f1ea13b368ae740d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 20 Aug 2008 16:35:19 -0700 Subject: rcu: fix synchronize_rcu() so that kernel-doc works Fix RCU's synchronize_rcu() so that it looks like a C function, enabling it to be recognized as a function with kernel-doc annotation. Warning(linux-2.6.26-git11//kernel/rcupdate.c:81): No description found for parameter 'synchronize_rcu' Warning(linux-2.6.26-git11//kernel/rcupdate.c:81): No description found for parameter 'call_rcu' [akpm@linux-foundation.org: fix comment] Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f14f372..467d594 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -77,6 +77,7 @@ void wakeme_after_rcu(struct rcu_head *head) * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ +void synchronize_rcu(void); /* Makes kernel-doc tools happy */ synchronize_rcu_xxx(synchronize_rcu, call_rcu) EXPORT_SYMBOL_GPL(synchronize_rcu); -- cgit v1.1 From 3c4fbe5e01d7e5309be5045e7ae0db20a049e6dc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 20 Aug 2008 16:37:38 -0700 Subject: nohz: fix wrong event handler after online an offlined cpu On the tickless system(CONFIG_NO_HZ=y and CONFIG_HIGH_RES_TIMERS=n), after I made an offlined cpu online, I found this cpu's event handler was tick_handle_periodic, not tick_nohz_handler. After debuging, I found this bug was caused by the wrong tick mode. the tick mode is not changed to NOHZ_MODE_INACTIVE when the cpu is offline. This patch fixes this bug. Signed-off-by: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f5da526..7a46bde 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -643,17 +643,21 @@ void tick_setup_sched_timer(void) ts->nohz_mode = NOHZ_MODE_HIGHRES; #endif } +#endif /* HIGH_RES_TIMERS */ +#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +# ifdef CONFIG_HIGH_RES_TIMERS if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); +# endif ts->nohz_mode = NOHZ_MODE_INACTIVE; } -#endif /* HIGH_RES_TIMERS */ +#endif /** * Async notification about clocksource changes -- cgit v1.1 From d82f0b0f6f1a0a25afc288fb7135b1601fe6df18 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Aug 2008 16:46:04 -0700 Subject: migrate_timers: add comment, use spinlock_irq() Add the comment to explain why the double lock in migrate_timers() can't deadlock. Change the code to use spinlock_irq() instead of local_irq_disable() + spin_lock(). Signed-off-by: Oleg Nesterov Acked-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 11 ++++++----- kernel/timer.c | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b8e4dce..03ea137 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1620,9 +1620,11 @@ static void migrate_hrtimers(int cpu) new_base = &get_cpu_var(hrtimer_bases); tick_cancel_sched_timer(cpu); - - local_irq_disable(); - spin_lock(&new_base->lock); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { @@ -1631,8 +1633,7 @@ static void migrate_hrtimers(int cpu) } spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); + spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/timer.c b/kernel/timer.c index 03bc7f1..e8019cc 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1435,9 +1435,11 @@ static void __cpuinit migrate_timers(int cpu) BUG_ON(cpu_online(cpu)); old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); - - local_irq_disable(); - spin_lock(&new_base->lock); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); @@ -1452,8 +1454,7 @@ static void __cpuinit migrate_timers(int cpu) } spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); + spin_unlock_irq(&new_base->lock); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ -- cgit v1.1 From 275a89bdd3868af3008852594d2e169eaf69441b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Aug 2008 06:14:55 -0700 Subject: rcu: use irq-safe locks Some earlier tip/core/rcu patches caused RCU to incorrectly enable irqs too early in boot. This caused Yinghai's repeated-kexec testing to hit oopses, presumably due to so that device interrupts left over from the prior kernel instance (which would oops the newly booting kernel before it got a chance to reset said devices). This patch therefore converts all the local_irq_disable()s in rcuclassic.c to local_irq_save(). Besides, I never did like local_irq_disable() anyway. ;-) Signed-off-by: Paul E. McKenney Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 01e761a..3f691896 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -247,6 +247,7 @@ static inline void raise_rcu_softirq(void) */ static void rcu_do_batch(struct rcu_data *rdp) { + unsigned long flags; struct rcu_head *next, *list; int count = 0; @@ -261,9 +262,9 @@ static void rcu_do_batch(struct rcu_data *rdp) } rdp->donelist = list; - local_irq_disable(); + local_irq_save(flags); rdp->qlen -= count; - local_irq_enable(); + local_irq_restore(flags); if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; @@ -464,12 +465,14 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, struct rcu_head **tail, long batch) { + unsigned long flags; + if (list) { - local_irq_disable(); + local_irq_save(flags); this_rdp->batch = batch; *this_rdp->nxttail[2] = list; this_rdp->nxttail[2] = tail; - local_irq_enable(); + local_irq_restore(flags); } } @@ -521,10 +524,11 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; long completed_snap; if (rdp->nxtlist) { - local_irq_disable(); + local_irq_save(flags); completed_snap = ACCESS_ONCE(rcp->completed); /* @@ -554,7 +558,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, rdp->nxttail[0] = &rdp->nxtlist; } - local_irq_enable(); + local_irq_restore(flags); if (rcu_batch_after(rdp->batch, rcp->pending)) { unsigned long flags; -- cgit v1.1 From 916c7a855174e3b53d182b97a26b2e27a29726a1 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Wed, 20 Aug 2008 16:46:08 -0700 Subject: ntp: fix ADJ_OFFSET_SS_READ bug and do_adjtimex() cleanup Thanks to the review by Michael Kerrisk a bug in the recent ADJ_OFFSET_SS_READ option was discovered, where the ntp time_offset was inadvertently set by it. This fixes this by making the adjtime code more separate from the ntp_adjtime code (both of which really want to be separate syscalls). Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Acked-by: John Stultz Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 76 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5125ddd..c6921aa1 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -277,38 +277,50 @@ static inline void notify_cmos_timer(void) { } int do_adjtimex(struct timex *txc) { struct timespec ts; - long save_adjust, sec; int result; - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - - /* Now we validate the data before disabling interrupts */ - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { + /* Validate the data before disabling interrupts */ + if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ - if (txc->modes & ~ADJ_OFFSET_SS_READ) + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + + /* if the quartz is off by more than 10% something is VERY wrong! */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + + if (txc->modes & ADJ_STATUS && time_state != TIME_OK) + hrtimer_cancel(&leap_timer); } - /* if the quartz is off by more than 10% something is VERY wrong ! */ - if (txc->modes & ADJ_TICK) - if (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ) - return -EINVAL; - - if (time_state != TIME_OK && txc->modes & ADJ_STATUS) - hrtimer_cancel(&leap_timer); getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); - /* Save for later - semantics of adjtime is to return old value */ - save_adjust = time_adjust; - /* If there are input parameters, then process them */ + if (txc->modes & ADJ_ADJTIME) { + long save_adjust = time_adjust; + + if (!(txc->modes & ADJ_OFFSET_READONLY)) { + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + ntp_update_frequency(); + } + txc->offset = save_adjust; + goto adj_done; + } if (txc->modes) { + long sec; + if (txc->modes & ADJ_STATUS) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { @@ -375,13 +387,8 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_TAI && txc->constant > 0) time_tai = txc->constant; - if (txc->modes & ADJ_OFFSET) { - if (txc->modes == ADJ_OFFSET_SINGLESHOT) - /* adjtime() is independent from ntp_adjtime() */ - time_adjust = txc->offset; - else - ntp_update_offset(txc->offset); - } + if (txc->modes & ADJ_OFFSET) + ntp_update_offset(txc->offset); if (txc->modes & ADJ_TICK) tick_usec = txc->tick; @@ -389,19 +396,16 @@ int do_adjtimex(struct timex *txc) ntp_update_frequency(); } + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + NTP_SCALE_SHIFT); + if (!(time_status & STA_NANO)) + txc->offset /= NSEC_PER_USEC; + +adj_done: result = time_state; /* mostly `TIME_OK' */ if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; - if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || - (txc->modes == ADJ_OFFSET_SS_READ)) - txc->offset = save_adjust; - else { - txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, - NTP_SCALE_SHIFT); - if (!(time_status & STA_NANO)) - txc->offset /= NSEC_PER_USEC; - } txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); -- cgit v1.1 From a38409fbb5181324fb73b359189a72e02b6c7030 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 Aug 2008 12:16:09 +0200 Subject: dma-coherent: export dma_[alloc|release]_from_coherent methods fixes modular builds: ERROR: "dma_alloc_from_coherent" [sound/core/snd-page-alloc.ko] undefined! ERROR: "dma_release_from_coherent" [sound/core/snd-page-alloc.ko] undefined! Signed-off-by: Ingo Molnar --- kernel/dma-coherent.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index c1d4d5b..f013a0c 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -124,6 +124,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, } return (mem != NULL); } +EXPORT_SYMBOL(dma_alloc_from_coherent); /** * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool @@ -151,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr) } return 0; } +EXPORT_SYMBOL(dma_release_from_coherent); -- cgit v1.1 From 94d3d8247de22c5b0624aa00616ceca459498e55 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Aug 2008 16:54:41 -0700 Subject: sched: do_wait_for_common: use signal_pending_state() Change do_wait_for_common() to use signal_pending_state() instead of open coding. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d601fb0..da7c5d2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4599,10 +4599,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - if ((state == TASK_INTERRUPTIBLE && - signal_pending(current)) || - (state == TASK_KILLABLE && - fatal_signal_pending(current))) { + if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } -- cgit v1.1 From f31e11d87a5d7601636710195891ba462ad99f11 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Aug 2008 16:54:44 -0700 Subject: wait_task_inactive(): don't consider task->nivcsw If wait_task_inactive() returns success the task was deactivated. In that case schedule() always increments ->nvcsw which alone can be used as a "generation counter". If the next call returns the same number, we can be sure that the task was unscheduled. Otherwise, because we know that .on_rq == 0 again, ->nvcsw should have been changed in between. Q: perhaps it is better to do "ncsw = (p->nvcsw << 1) | 1" ? This decreases the possibility of "was it unscheduled" false positive when ->nvcsw == 0. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index da7c5d2..908670a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1921,11 +1921,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; - if (!match_state || p->state == match_state) { - ncsw = p->nivcsw + p->nvcsw; - if (unlikely(!ncsw)) - ncsw = 1; - } + if (!match_state || p->state == match_state) + ncsw = p->nvcsw ?: 1; task_rq_unlock(rq, &flags); /* -- cgit v1.1 From 93dcf55f828b035fc93fc19eb03c1390e1e6d570 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Aug 2008 16:54:44 -0700 Subject: wait_task_inactive: "improve" the returned value for ->nvcsw == 0 wait_task_inactive() returns 1 when p->nvcsw == 0 || p->nvcsw == 1. This means that two subsequent calls can return the same number while the task was scheduled in between. Change the code to return "nvcsw | LONG_MIN" instead of "nvcsw ?: 1", now the overlap always needs LONG_MAX schedules. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 908670a..6a43c89 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1922,7 +1922,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) on_rq = p->se.on_rq; ncsw = 0; if (!match_state || p->state == match_state) - ncsw = p->nvcsw ?: 1; + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, &flags); /* -- cgit v1.1 From 7a8fc9b248e77a4eab0613acf30a6811799786b3 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 17 Aug 2008 17:36:59 +0300 Subject: removed unused #include 's This patch lets the files using linux/version.h match the files that #include it. Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 1 - kernel/power/swap.c | 1 - kernel/user_namespace.c | 1 - kernel/utsname.c | 1 - kernel/utsname_sysctl.c | 1 - 5 files changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 21575fc..1d3ef29 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -14,7 +14,6 @@ */ #include -#include #include #include #include diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a0abf9a..80ccac8 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index a9ab059..532858f 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -6,7 +6,6 @@ */ #include -#include #include #include #include diff --git a/kernel/utsname.c b/kernel/utsname.c index 64d398f..815237a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index fe3a56c..4ab9659 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -12,7 +12,6 @@ #include #include #include -#include #include static void *get_uts(ctl_table *table, int write) -- cgit v1.1 From 354879bb977e06695993435745f06a0f6d39ce2b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Aug 2008 17:15:34 +0200 Subject: sched_clock: fix cpu_clock() This patch fixes 3 issues: a) it removes the dependency on jiffies, because jiffies are incremented by a single CPU, and the tick is not synchronized between CPUs. Therefore relying on it to calculate a window to clip whacky TSC values doesn't work as it can drift around. So instead use [GTOD, GTOD+TICK_NSEC) as the window. b) __update_sched_clock() did (roughly speaking): delta = sched_clock() - scd->tick_raw; clock += delta; Which gives exponential growth, instead of linear. c) allows the sched_clock_cpu() value to warp the u64 without breaking. the results are more reliable sched_clock() deltas: before after sched_clock cpu_clock: 15750 51312 51488 cpu_clock: 59719 51052 50947 cpu_clock: 15879 51249 51061 cpu_clock: 1 50933 51198 cpu_clock: 1 50931 51039 cpu_clock: 1 51093 50981 cpu_clock: 1 51043 51040 cpu_clock: 1 50959 50938 cpu_clock: 1 50981 51011 cpu_clock: 1 51364 51212 cpu_clock: 1 51219 51273 cpu_clock: 1 51389 51048 cpu_clock: 1 51285 51611 cpu_clock: 1 50964 51137 cpu_clock: 1 50973 50968 cpu_clock: 1 50967 50972 cpu_clock: 1 58910 58485 cpu_clock: 1 51082 51025 cpu_clock: 1 50957 50958 cpu_clock: 1 50958 50957 cpu_clock: 1006128 51128 50971 cpu_clock: 1 51107 51155 cpu_clock: 1 51371 51081 cpu_clock: 1 51104 51365 cpu_clock: 1 51363 51309 cpu_clock: 1 51107 51160 cpu_clock: 1 51139 51100 cpu_clock: 1 51216 51136 cpu_clock: 1 51207 51215 cpu_clock: 1 51087 51263 cpu_clock: 1 51249 51177 cpu_clock: 1 51519 51412 cpu_clock: 1 51416 51255 cpu_clock: 1 51591 51594 cpu_clock: 1 50966 51374 cpu_clock: 1 50966 50966 cpu_clock: 1 51291 50948 cpu_clock: 1 50973 50867 cpu_clock: 1 50970 50970 cpu_clock: 998306 50970 50971 cpu_clock: 1 50971 50970 cpu_clock: 1 50970 50970 cpu_clock: 1 50971 50971 cpu_clock: 1 50970 50970 cpu_clock: 1 51351 50970 cpu_clock: 1 50970 51352 cpu_clock: 1 50971 50970 cpu_clock: 1 50970 50970 cpu_clock: 1 51321 50971 cpu_clock: 1 50974 51324 Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 84 +++++++++++++++++++++------------------------------- 1 file changed, 34 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 204991a..e8ab096 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -12,19 +12,17 @@ * * Create a semi stable clock from a mixture of other events, including: * - gtod - * - jiffies * - sched_clock() * - explicit idle events * * We use gtod as base and the unstable clock deltas. The deltas are filtered, - * making it monotonic and keeping it within an expected window. This window - * is set up using jiffies. + * making it monotonic and keeping it within an expected window. * * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat - * consistent between cpus (never more than 1 jiffies difference). + * consistent between cpus (never more than 2 jiffies difference). */ #include #include @@ -54,7 +52,6 @@ struct sched_clock_data { */ raw_spinlock_t lock; - unsigned long tick_jiffies; u64 tick_raw; u64 tick_gtod; u64 clock; @@ -75,14 +72,12 @@ static inline struct sched_clock_data *cpu_sdc(int cpu) void sched_clock_init(void) { u64 ktime_now = ktime_to_ns(ktime_get()); - unsigned long now_jiffies = jiffies; int cpu; for_each_possible_cpu(cpu) { struct sched_clock_data *scd = cpu_sdc(cpu); scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - scd->tick_jiffies = now_jiffies; scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; @@ -92,46 +87,51 @@ void sched_clock_init(void) } /* + * min,max except they take wrapping into account + */ + +static inline u64 wrap_min(u64 x, u64 y) +{ + return (s64)(x - y) < 0 ? x : y; +} + +static inline u64 wrap_max(u64 x, u64 y) +{ + return (s64)(x - y) > 0 ? x : y; +} + +/* * update the percpu scd from the raw @now value * * - filter out backward motion - * - use jiffies to generate a min,max window to clip the raw values + * - use the GTOD tick value to create a window to filter crazy TSC values */ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) { - unsigned long now_jiffies = jiffies; - long delta_jiffies = now_jiffies - scd->tick_jiffies; - u64 clock = scd->clock; - u64 min_clock, max_clock; s64 delta = now - scd->tick_raw; + u64 clock, min_clock, max_clock; WARN_ON_ONCE(!irqs_disabled()); - min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; - if (unlikely(delta < 0)) { - clock++; - goto out; - } + if (unlikely(delta < 0)) + delta = 0; - max_clock = min_clock + TICK_NSEC; + /* + * scd->clock = clamp(scd->tick_gtod + delta, + * max(scd->tick_gtod, scd->clock), + * scd->tick_gtod + TICK_NSEC); + */ - if (unlikely(clock + delta > max_clock)) { - if (clock < max_clock) - clock = max_clock; - else - clock++; - } else { - clock += delta; - } + clock = scd->tick_gtod + delta; + min_clock = wrap_max(scd->tick_gtod, scd->clock); + max_clock = scd->tick_gtod + TICK_NSEC; - out: - if (unlikely(clock < min_clock)) - clock = min_clock; + clock = wrap_max(clock, min_clock); + clock = wrap_min(clock, max_clock); - scd->tick_jiffies = now_jiffies; scd->clock = clock; - return clock; + return scd->clock; } static void lock_double_clock(struct sched_clock_data *data1, @@ -171,7 +171,7 @@ u64 sched_clock_cpu(int cpu) * larger time as the latest time for both * runqueues. (this creates monotonic movement) */ - if (likely(remote_clock < this_clock)) { + if (likely((s64)(remote_clock - this_clock) < 0)) { clock = this_clock; scd->clock = clock; } else { @@ -207,14 +207,9 @@ void sched_clock_tick(void) now = sched_clock(); __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now); - /* - * update tick_gtod after __update_sched_clock() because that will - * already observe 1 new jiffy; adding a new tick_gtod to that would - * increase the clock 2 jiffies. - */ scd->tick_raw = now; scd->tick_gtod = now_gtod; + __update_sched_clock(scd, now); __raw_spin_unlock(&scd->lock); } @@ -232,18 +227,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); */ void sched_clock_idle_wakeup_event(u64 delta_ns) { - struct sched_clock_data *scd = this_scd(); - - /* - * Override the previous timestamp and ignore all - * sched_clock() deltas that occured while we idled, - * and use the PM-provided delta_ns to advance the - * rq clock: - */ - __raw_spin_lock(&scd->lock); - scd->clock += delta_ns; - __raw_spin_unlock(&scd->lock); - + sched_clock_tick(); touch_softlockup_watchdog(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -- cgit v1.1 From ffb4ba76a25ab6c9deeec33e4f58395586ca747c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 25 Aug 2008 11:10:26 -0700 Subject: [module] Don't let gcc inline load_module() 'load_module()' is a complex function that contains all the ELF section logic, and inlining it is utterly insane. But gcc will do it, simply because there is only one call-site. As a result, all the stack space that is allocated for all the work to load the module will still be active when we actually call the module init sequence, and the deep call chain makes stack overflows happen. And stack overflows are really hard to debug, because they not only corrupt random pages below the stack, but also corrupt the thread_info structure that is allocated under the stack. In this case, Alan Brunelle reported some crazy oopses at bootup, after loading the processor module that ends up doing complex ACPI stuff and has quite a deep callchain. This should fix it, and is the sane thing to do regardless. Cc: Alan D. Brunelle Cc: Arjan van de Ven Cc: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 08864d2..9db1191 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1799,7 +1799,7 @@ static void *module_alloc_update_bounds(unsigned long size) /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ -static struct module *load_module(void __user *umod, +static noinline struct module *load_module(void __user *umod, unsigned long len, const char __user *uargs) { -- cgit v1.1 From f73be6dedf4fa058ce80846dae604b08fa805ca1 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 Aug 2008 17:07:14 -0700 Subject: smp: have smp_call_function_single() detect invalid CPUs Have smp_call_function_single() return invalid CPU indicies and return -ENXIO. This function is already executed inside a get_cpu()..put_cpu() which locks out CPU removal, so rather than having the higher layers doing another layer of locking to guard against unplugged CPUs do the test here. Signed-off-by: H. Peter Anvin --- kernel/smp.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 782e2b9..f362a85 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -210,8 +210,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, { struct call_single_data d; unsigned long flags; - /* prevent preemption and reschedule on another processor */ + /* prevent preemption and reschedule on another processor, + as well as CPU removal */ int me = get_cpu(); + int err = 0; /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -220,7 +222,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, local_irq_save(flags); func(info); local_irq_restore(flags); - } else { + } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { struct call_single_data *data = NULL; if (!wait) { @@ -236,10 +238,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, data->func = func; data->info = info; generic_exec_single(cpu, data); + } else { + err = -ENXIO; /* CPU not online */ } put_cpu(); - return 0; + return err; } EXPORT_SYMBOL(smp_call_function_single); -- cgit v1.1 From 65eb3dc609dec17deea48dcd4de2e549d29a9824 Mon Sep 17 00:00:00 2001 From: Kevin Diggs Date: Tue, 26 Aug 2008 10:26:54 +0200 Subject: sched: add kernel doc for the completion, fix kernel-doc-nano-HOWTO.txt This patch adds kernel doc for the completion feature. An error in the split-man.pl PERL snippet in kernel-doc-nano-HOWTO.txt is also fixed. Signed-off-by: Kevin Diggs Signed-off-by: Ingo Molnar --- kernel/sched.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 29e2ec0..93f5ea0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4565,6 +4565,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + */ void complete(struct completion *x) { unsigned long flags; @@ -4576,6 +4585,12 @@ void complete(struct completion *x) } EXPORT_SYMBOL(complete); +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + */ void complete_all(struct completion *x) { unsigned long flags; @@ -4624,12 +4639,31 @@ wait_for_common(struct completion *x, long timeout, int state) return timeout; } +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { @@ -4637,6 +4671,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) } EXPORT_SYMBOL(wait_for_completion_timeout); +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); @@ -4646,6 +4687,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_interruptible); +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + */ unsigned long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) @@ -4654,6 +4703,13 @@ wait_for_completion_interruptible_timeout(struct completion *x, } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + */ int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); -- cgit v1.1 From 2189459d25a47401c69a17794c9d390c890351f9 Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Mon, 25 Aug 2008 17:15:33 -0400 Subject: lockstat: fix numerical output rounding error Fix rounding error in /proc/lock_stat numerical output. On occasion the two digit fractional part contains the three digit value '100'. This is due to a bug in the rounding algorithm which pushes values in the range '95..99' to '100' rather than to '00' + an increment to the integer part. For example, - 123456.100 old display + 123457.00 new display Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 4b194d3..20dbcbf 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -472,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr) { unsigned long rem; + nr += 5; /* for display rounding */ rem = do_div(nr, 1000); /* XXX: do_div_signed */ - snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); + snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); } static void seq_time(struct seq_file *m, s64 time) -- cgit v1.1 From 04148b73b89d49fe0fe201bcee395e51f7d637ce Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Mon, 25 Aug 2008 17:16:23 -0400 Subject: lockstat: repair erronous contention statistics Fix bad contention counting in /proc/lock_stat. /proc/lockstat tries to gather per-ip contention statistics per-lock. This was failing due to a garbage per-ip index selector being used. Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3bfb187..b5db51d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3029,7 +3029,7 @@ found_it: stats = get_lock_stats(hlock_class(hlock)); if (point < ARRAY_SIZE(stats->contention_point)) - stats->contention_point[i]++; + stats->contention_point[point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); -- cgit v1.1 From 74870172824a78640ec4f03058d9bd35dfa08618 Mon Sep 17 00:00:00 2001 From: Zhu Yi Date: Wed, 27 Aug 2008 14:33:00 +0800 Subject: lockdep: fix invalid list_del_rcu in zap_class The problem is found during iwlagn driver testing on v2.6.27-rc4-176-gb8e6c91 kernel, but it turns out to be a lockdep bug. In our testing, we frequently load and unload the iwlagn driver (>50 times). Then the MAX_STACK_TRACE_ENTRIES is reached (expected behaviour?). The error message with the call trace is as below. BUG: MAX_STACK_TRACE_ENTRIES too low! turning off the locking correctness validator. Pid: 4895, comm: iwlagn Not tainted 2.6.27-rc4 #13 Call Trace: [] save_stack_trace+0x22/0x3e [] save_trace+0x8b/0x91 [] mark_lock+0x1b0/0x8fa [] __lock_acquire+0x5b9/0x716 [] ieee80211_sta_work+0x0/0x6ea [mac80211] [] lock_acquire+0x52/0x6b [] run_workqueue+0x97/0x1ed [] run_workqueue+0xe7/0x1ed [] run_workqueue+0x97/0x1ed [] worker_thread+0xd8/0xe3 [] autoremove_wake_function+0x0/0x2e [] worker_thread+0x0/0xe3 [] kthread+0x47/0x73 [] trace_hardirqs_on_thunk+0x3a/0x3f [] child_rip+0xa/0x11 [] restore_args+0x0/0x30 [] finish_task_switch+0x0/0xcc [] kthread+0x0/0x73 [] child_rip+0x0/0x11 Although the above is harmless, when the ilwagn module is removed later lockdep will trigger a kernel oops as below. BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] zap_class+0x24/0x82 PGD 73128067 PUD 7448c067 PMD 0 Oops: 0002 [1] SMP CPU 0 Modules linked in: rfcomm l2cap bluetooth autofs4 sunrpc nf_conntrack_ipv6 xt_state nf_conntrack xt_tcpudp ip6t_ipv6header ip6t_REJECT ip6table_filter ip6_tables x_tables ipv6 cpufreq_ondemand acpi_cpufreq dm_mirror dm_log dm_multipath dm_mod snd_hda_intel sr_mod snd_seq_dummy snd_seq_oss snd_seq_midi_event battery snd_seq snd_seq_device cdrom button snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd_page_alloc e1000e snd_hwdep sg iTCO_wdt iTCO_vendor_support ac pcspkr i2c_i801 i2c_core snd soundcore video output ata_piix ata_generic libata sd_mod scsi_mod ext3 jbd mbcache uhci_hcd ohci_hcd ehci_hcd [last unloaded: mac80211] Pid: 4941, comm: modprobe Not tainted 2.6.27-rc4 #10 RIP: 0010:[] [] zap_class+0x24/0x82 RSP: 0000:ffff88007bcb3eb0 EFLAGS: 00010046 RAX: 0000000000068ee8 RBX: ffffffff8192a0a0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000001dfb RDI: ffffffff816e70b0 RBP: ffffffffa00cd000 R08: ffffffff816818f8 R09: ffff88007c923558 R10: ffffe20002ad2408 R11: ffffffff811028ec R12: ffffffff8192a0a0 R13: 000000000002bd90 R14: 0000000000000000 R15: 0000000000000296 FS: 00007f9d1cee56f0(0000) GS:ffffffff814a58c0(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000008 CR3: 0000000073047000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process modprobe (pid: 4941, threadinfo ffff88007bcb2000, task ffff8800758d1fc0) Stack: ffffffff81057376 0000000000000000 ffffffffa00f7b00 0000000000000000 0000000000000080 0000000000618278 00007fff24f16720 0000000000000000 ffffffff8105d37a ffffffffa00f7b00 ffffffff8105d591 313132303863616d Call Trace: [] ? lockdep_free_key_range+0x61/0xf5 [] ? free_module+0xd4/0xe4 [] ? sys_delete_module+0x1de/0x1f9 [] ? audit_syscall_entry+0x12d/0x160 [] ? system_call_fastpath+0x16/0x1b Code: b2 00 01 00 00 00 c3 31 f6 49 c7 c0 10 8a 61 81 eb 32 49 39 38 75 26 48 98 48 6b c0 38 48 8b 90 08 8a 61 81 48 8b 88 00 8a 61 81 <48> 89 51 08 48 89 0a 48 c7 80 08 8a 61 81 00 02 20 00 48 ff c6 RIP [] zap_class+0x24/0x82 RSP CR2: 0000000000000008 ---[ end trace a1297e0c4abb0f2e ]--- The root cause for this oops is in add_lock_to_list() when save_trace() fails due to MAX_STACK_TRACE_ENTRIES is reached, entry->class is assigned but entry is never added into any lock list. This makes the list_del_rcu() in zap_class() oops later when the module is unloaded. This patch fixes the problem by assigning entry->class after save_trace() returns success. Signed-off-by: Zhu Yi Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b5db51d..dbda475 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -875,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, if (!entry) return 0; - entry->class = this; - entry->distance = distance; if (!save_trace(&entry->trace)) return 0; + entry->class = this; + entry->distance = distance; /* * Since we never remove from the dependency list, the list can * be walked lockless by other CPUs, it's only allocation -- cgit v1.1 From 2633f0e57b1127f4060d70bf460140dc9bb19386 Mon Sep 17 00:00:00 2001 From: Steve VanDeBogart Date: Tue, 26 Aug 2008 15:14:36 -0700 Subject: exit signals: use of uninitialized field notify_count task->signal->notify_count is only initialized if task->signal->group_exit_task is not NULL. Reorder a conditional so that uninitialised memory is not used. Found by Valgrind. Signed-off-by: Steve VanDeBogart Signed-off-by: Ingo Molnar --- kernel/exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 38ec406..75c6473 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -918,8 +918,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && - tsk->signal->notify_count < 0 && - tsk->signal->group_exit_task) + tsk->signal->group_exit_task && + tsk->signal->notify_count < 0) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); -- cgit v1.1 From 0cd418ddb1ee88df7d16d5df06cb2da68eceb9e4 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 18 Aug 2008 14:39:21 -0700 Subject: rcuclassic: fix compiler warning CC kernel/rcuclassic.o kernel/rcuclassic.c: In function 'rcu_init_percpu_data': kernel/rcuclassic.c:705: warning: comparison of distinct pointer types lacks a cast kernel/rcuclassic.c:713: warning: comparison of distinct pointer types lacks a cast flags should be unsigned long. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 3f691896..743cf05 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -714,7 +714,7 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - long flags; + unsigned long flags; spin_lock_irqsave(&rcp->lock, flags); memset(rdp, 0, sizeof(*rdp)); -- cgit v1.1 From f42ac38c59e0a03d6da0c24a63fb211393f484b0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Aug 2008 09:14:40 -0400 Subject: ftrace: disable tracing for suspend to ram I've been painstakingly debugging the issue with suspend to ram and ftraced. The 2.6.28 code does not have this issue, but since the mcount recording is not going to be in 27, this must be solved for the ftrace daemon version. The resume from suspend to ram would reboot because it was triple faulting. Debugging further, I found that calling the mcount function itself was not an issue, but it would fault when it incremented preempt_count. preempt_count is on the tasks info structure that is on the low memory address of the task's stack. For some reason, it could not write to it. Resuming out of suspend to ram does quite a lot of funny tricks to get to work, so it is not surprising at all that simply doing a preempt_disable() would cause a fault. Thanks to Rafael for suggesting to add a "while (1);" to find the place in resuming that is causing the fault. I would place the loop somewhere in the code, compile and reboot and see if it would either reboot (hit the fault) or simply hang (hit the loop). Doing this over and over again, I narrowed it down that it was happening in enable_nonboot_cpus. At this point, I found that it is easier to simply disable tracing around the suspend code, instead of searching for the particular function that can not handle doing a preempt_disable. This patch disables the tracer as it suspends and reenables it on resume. I tested this patch on my Laptop, and it can resume fine with the patch. Signed-off-by: Steven Rostedt Acked-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- kernel/power/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 0b7476f..540b16b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "power.h" @@ -310,7 +311,7 @@ static int suspend_enter(suspend_state_t state) */ int suspend_devices_and_enter(suspend_state_t state) { - int error; + int error, ftrace_save; if (!suspend_ops) return -ENOSYS; @@ -321,6 +322,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + ftrace_save = __ftrace_enabled_save(); suspend_test_start(); error = device_suspend(PMSG_SUSPEND); if (error) { @@ -352,6 +354,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); device_resume(PMSG_RESUME); suspend_test_finish("resume devices"); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: if (suspend_ops->end) -- cgit v1.1 From f3ade837808121ff8bab9c56725f4fe40ec85a56 Mon Sep 17 00:00:00 2001 From: John Blackwood Date: Tue, 26 Aug 2008 15:09:43 -0400 Subject: sched: fix sched_rt_rq_enqueue() resched idle When sysctl_sched_rt_runtime is set to something other than -1 and the CONFIG_RT_GROUP_SCHED kernel parameter is NOT enabled, we get into a state where we see one or more CPUs idling forvever even though there are real-time tasks in their rt runqueue that are able to run (no longer throttled). The sequence is: - A real-time task is running when the timer sets the rt runqueue to throttled, and the rt task is resched_task()ed and switched out, and idle is switched in since there are no non-rt tasks to run on that cpu. - Eventually the do_sched_rt_period_timer() runs and un-throttles the rt runqueue, but we just exit the timer interrupt and go back to executing the idle task in the idle loop forever. If we change the sched_rt_rq_enqueue() routine to use some of the code from the CONFIG_RT_GROUP_SCHED enabled version of this same routine and resched_task() the currently executing task (idle in our case) if it is a lower priority task than the higher rt task in the now un-throttled runqueue, the problem is no longer observed. Signed-off-by: John Blackwood Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 998ba54b..07d9b33 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -199,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { + if (rt_rq->rt_nr_running) + resched_task(rq_of_rt_rq(rt_rq)->curr); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) -- cgit v1.1 From aec0a5142cb52aaa152d962d84a838e25d520742 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Thu, 28 Aug 2008 14:42:49 +0530 Subject: sched: call resched_task() conditionally from new task wake up path - During wake up of a new task, task_new_fair() can do a resched_task() on the current task. Later in the code path, check_preempt_curr() also ends up doing the same, which can be avoided. Check if TIF_NEED_RESCHED is already set for the current task. - task_new_fair() does a resched_task() on the current task unconditionally. This can be done only in case when child runs before the parent. So this is a small speedup. Signed-off-by: Bharata B Rao Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fb8994c..8264bb5 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1348,6 +1348,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(se == pse)) return; + /* + * We can come here with TIF_NEED_RESCHED already set from new task + * wake up path. + */ + if (test_tsk_need_resched(curr)) + return; + cfs_rq_of(pse)->next = pse; /* @@ -1620,10 +1627,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); + resched_task(rq->curr); } enqueue_task_fair(rq, p, 0); - resched_task(rq->curr); } /* -- cgit v1.1 From 29cbef4869bf288256ab76c7dc674cb132b35de2 Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Wed, 27 Aug 2008 11:21:39 -0400 Subject: make might_sleep() display the oopsing process Expand might_sleep's printk to indicate the oopsing process. Signed-off-by: Joe Korty Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 93f5ea0..6e283dc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8233,8 +8233,8 @@ void __might_sleep(char *file, int line) prev_jiffy = jiffies; printk(KERN_ERR "BUG: sleeping function called from invalid" " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); + printk("in_atomic():%d, irqs_disabled():%d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), current->pid, current->comm); debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); -- cgit v1.1 From aef745fca016aea45adae5c98e8698904dd8ad51 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Aug 2008 11:34:43 +0200 Subject: sched: clean up __might_sleep() add KERN_ to the printout and clean up the flow a bit. Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6e283dc..b112caa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8226,20 +8226,25 @@ void __might_sleep(char *file, int line) #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((in_atomic() || irqs_disabled()) && - system_state == SYSTEM_RUNNING && !oops_in_progress) { - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - printk(KERN_ERR "BUG: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), current->pid, current->comm); - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); - } + if ((!in_atomic() && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); #endif } EXPORT_SYMBOL(__might_sleep); -- cgit v1.1 From 7940ca3605b77f20cc6e9852e4ca6f2d725b5653 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 Aug 2008 13:40:47 +0200 Subject: sched: extract walk_tg_tree(), fix fix: kernel/sched.c: In function '__rt_schedulable': kernel/sched.c:8771: error: implicit declaration of function 'walk_tg_tree' kernel/sched.c:8771: error: 'tg_nop' undeclared (first use in this function) kernel/sched.c:8771: error: (Each undeclared identifier is reported only once kernel/sched.c:8771: error: for each function it appears in.) Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e41bdae..703f56d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED) +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) typedef int (*tg_visitor)(struct task_group *, void *); /* -- cgit v1.1 From cc2991cf15ae92fa30b3ea9f56a8a5a337bd33c7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:03 +0200 Subject: sched: rt-bandwidth accounting fix It fixes an accounting bug where we would continue accumulating runtime even though the bandwidth control is disabled. This would lead to very long throttle periods once bandwidth control gets turned on again. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 07d9b33..5523107 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -440,9 +440,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); @@ -493,9 +490,11 @@ static void update_curr_rt(struct rq *rq) rt_rq = rt_rq_of_se(rt_se); spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + } spin_unlock(&rt_rq->rt_runtime_lock); } } -- cgit v1.1 From 41108eb10142e0552f2de1e4c0675b108c5f018f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Aug 2008 14:39:12 +0200 Subject: ftrace: disable tracing for hibernation In accordance with commit f42ac38c59e0a03d6da0c24a63fb211393f484b0 ("ftrace: disable tracing for suspend to ram"), disable tracing around the suspend code in hibernation code paths. Signed-off-by: Rafael J. Wysocki Acked-by: Steven Rostedt Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f011e08..bbd85c6 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "power.h" @@ -255,7 +256,7 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { - int error; + int error, ftrace_save; /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); @@ -267,6 +268,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_FREEZE); if (error) goto Recover_platform; @@ -296,6 +298,7 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: platform_end(platform_mode); @@ -366,10 +369,11 @@ static int resume_target_kernel(void) int hibernation_restore(int platform_mode) { - int error; + int error, ftrace_save; pm_prepare_console(); suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -384,6 +388,7 @@ int hibernation_restore(int platform_mode) platform_restore_cleanup(platform_mode); device_resume(PMSG_RECOVER); Finish: + __ftrace_enabled_restore(ftrace_save); resume_console(); pm_restore_console(); return error; @@ -396,7 +401,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { - int error; + int error, ftrace_save; if (!hibernation_ops) return -ENOSYS; @@ -411,6 +416,7 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -445,6 +451,7 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: hibernation_ops->end(); -- cgit v1.1 From 316d9679f33caf7e683471647d1472bfe133d858 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 29 Aug 2008 20:06:23 +0200 Subject: Don't trigger softlockup detector on network fs blocked tasks Pulling the ethernet cable on a 2.6.27-rc system with NFS mounts currently leads to an ongoing flood of soft lockup detector backtraces for all tasks blocked on the NFS mounts when the hickup takes longer than 120s. I don't think NFS problems should be all that noisy. Luckily there's a reasonably easy way to distingush this case. Don't report task softlockup warnings for tasks in TASK_KILLABLE state, which is used by the network file systems. I believe this patch is a 2.6.27 candidate. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/softlockup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index b75b492..1a07f8c 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -180,6 +180,10 @@ static void check_hung_task(struct task_struct *t, unsigned long now) if (t->flags & PF_FROZEN) return; + /* Don't check for tasks waiting on network file systems like NFS */ + if (t->state & TASK_KILLABLE) + return; + if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { t->last_switch_count = switch_count; t->last_switch_timestamp = now; -- cgit v1.1 From bef69ea0dcce574a425feb0a5aa4c63dd108b9a6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 29 Aug 2008 20:18:31 -0700 Subject: Resource handling: add 'insert_resource_expand_to_fit()' function Not used anywhere yet, but this complements the existing plain 'insert_resource()' functionality with a version that can expand the resource we are adding in order to fix up any conflicts it has with existing resources. Signed-off-by: Linus Torvalds --- kernel/resource.c | 88 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index f5b518e..cf0a178 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new, EXPORT_SYMBOL(allocate_resource); -/** - * insert_resource - Inserts a resource in the resource tree - * @parent: parent of the new resource - * @new: new resource to insert - * - * Returns 0 on success, -EBUSY if the resource can't be inserted. - * - * This function is equivalent to request_resource when no conflict - * happens. If a conflict happens, and the conflicting resources - * entirely fit within the range of the new resource, then the new - * resource is inserted and the conflicting resources become children of - * the new resource. +/* + * Insert a resource into the resource tree. If successful, return NULL, + * otherwise return the conflicting resource (compare to __request_resource()) */ -int insert_resource(struct resource *parent, struct resource *new) +static struct resource * __insert_resource(struct resource *parent, struct resource *new) { - int result; struct resource *first, *next; - write_lock(&resource_lock); - for (;; parent = first) { - result = 0; first = __request_resource(parent, new); if (!first) - goto out; + return first; - result = -EBUSY; if (first == parent) - goto out; + return first; if ((first->start > new->start) || (first->end < new->end)) break; @@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new) for (next = first; ; next = next->sibling) { /* Partial overlap? Bad, and unfixable */ if (next->start < new->start || next->end > new->end) - goto out; + return next; if (!next->sibling) break; if (next->sibling->start > new->end) break; } - result = 0; - new->parent = parent; new->sibling = next->sibling; new->child = first; @@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new) next = next->sibling; next->sibling = new; } + return NULL; +} - out: +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is equivalent to request_resource when no conflict + * happens. If a conflict happens, and the conflicting resources + * entirely fit within the range of the new resource, then the new + * resource is inserted and the conflicting resources become children of + * the new resource. + */ +int insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *conflict; + + write_lock(&resource_lock); + conflict = __insert_resource(parent, new); + write_unlock(&resource_lock); + return conflict ? -EBUSY : 0; +} + +/** + * insert_resource_expand_to_fit - Insert a resource into the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Insert a resource into the resource tree, possibly expanding it in order + * to make it encompass any conflicting resources. + */ +void insert_resource_expand_to_fit(struct resource *root, struct resource *new) +{ + if (new->parent) + return; + + write_lock(&resource_lock); + for (;;) { + struct resource *conflict; + + conflict = __insert_resource(root, new); + if (!conflict) + break; + if (conflict == root) + break; + + /* Ok, expand resource to cover the conflict, then try again .. */ + if (conflict->start < new->start) + new->start = conflict->start; + if (conflict->end > new->end) + new->end = conflict->end; + + printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); + } write_unlock(&resource_lock); - return result; } /** -- cgit v1.1 From c4bacefb7aaf49da11a695f29d85d40909f17693 Mon Sep 17 00:00:00 2001 From: Cordelia Date: Mon, 18 Aug 2008 09:45:51 -0700 Subject: [PATCH] audit: Moved variable declaration to beginning of function got rid of compilation warning: ISO C90 forbids mixed declarations and code Signed-off-by: Cordelia Sam Signed-off-by: Al Viro --- kernel/auditsc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 972f8e6..59cedfb 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -243,10 +243,11 @@ static inline int open_arg(int flags, int mask) static int audit_match_perm(struct audit_context *ctx, int mask) { + unsigned n; if (unlikely(!ctx)) return 0; - unsigned n = ctx->major; + n = ctx->major; switch (audit_classify_syscall(ctx->arch, n)) { case 0: /* native */ if ((mask & AUDIT_PERM_WRITE) && -- cgit v1.1 From 6781f4ae30bbb8ebf31187b3c9304be16966f5a0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Aug 2008 20:31:55 -0700 Subject: kernel/resource.c: fix new kernel-doc warning Fix kernel-doc warning for new function: Warning(linux-2.6.27-rc5-git2//kernel/resource.c:448): No description found for parameter 'root' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index cf0a178..03d796c 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -438,7 +438,7 @@ int insert_resource(struct resource *parent, struct resource *new) /** * insert_resource_expand_to_fit - Insert a resource into the resource tree - * @parent: parent of the new resource + * @root: root resource descriptor * @new: new resource to insert * * Insert a resource into the resource tree, possibly expanding it in order -- cgit v1.1 From cbaed698f37494b30b2449b51c728ae48630cb2b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 30 Aug 2008 21:08:40 +0400 Subject: softlockup: minor cleanup, don't check task->state twice The recent commit 16d9679f33caf7e683471647d1472bfe133d858 changed check_hung_task() to filter out the TASK_KILLABLE tasks. We can move this check to the caller which has to test t->state anyway. Signed-off-by: Oleg Nesterov Acked-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/softlockup.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 1a07f8c..cb838ee 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -180,10 +180,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now) if (t->flags & PF_FROZEN) return; - /* Don't check for tasks waiting on network file systems like NFS */ - if (t->state & TASK_KILLABLE) - return; - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { t->last_switch_count = switch_count; t->last_switch_timestamp = now; @@ -237,7 +233,8 @@ static void check_hung_uninterruptible_tasks(int this_cpu) do_each_thread(g, t) { if (!--max_count) goto unlock; - if (t->state & TASK_UNINTERRUPTIBLE) + /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ + if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, now); } while_each_thread(g, t); unlock: -- cgit v1.1 From add0d4dfd660e9e4fd0af3eac3cad23583c9558f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 2 Sep 2008 14:35:48 -0700 Subject: pid_ns: zap_pid_ns_processes: fix the ->child_reaper changing zap_pid_ns_processes() sets pid_ns->child_reaper = NULL, this is wrong. Yes, we have already killed all tasks in this namespace, and sys_wait4() doesn't see any child. But this doesn't mean ->children list is empty, we may have EXIT_DEAD tasks which are not visible to do_wait(). In that case the subsequent forget_original_parent() will crash the kernel because it will try to re-parent these tasks to the NULL reaper. Even if there are no childs, it is not good that forget_original_parent() uses reaper == NULL. Change the code to set ->child_reaper = init_pid_ns.child_reaper instead. We could use pid_ns->parent->child_reaper as well, I think this does not really matter. These EXIT_DEAD tasks are not visible to the new ->parent after re-parenting, they will silently do release_task() eventually. Note that we must change ->child_reaper, otherwise forget_original_parent() will use reaper == father, and in that case we will hit the (correct) BUG_ON(!list_empty(&father->children)). Signed-off-by: Oleg Nesterov Acked-by: Serge Hallyn Acked-by: Sukadev Bhattiprolu Acked-by: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ea567b7..598f1ee 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -179,9 +179,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); - - /* Child reaper for the pid namespace is going away */ - pid_ns->child_reaper = NULL; + /* + * We can not clear ->child_reaper or leave it alone. + * There may by stealth EXIT_DEAD tasks on ->children, + * forget_original_parent() must move them somewhere. + */ + pid_ns->child_reaper = init_pid_ns.child_reaper; acct_exit_ns(pid_ns); return; } -- cgit v1.1 From 950bbabb5a804690a0201190de5c22837f72f83f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 2 Sep 2008 14:35:49 -0700 Subject: pid_ns: (BUG 11391) change ->child_reaper when init->group_leader exits We don't change pid_ns->child_reaper when the main thread of the subnamespace init exits. As Robert Rex pointed out this is wrong. Yes, the re-parenting itself works correctly, but if the reparented task exits it needs ->parent->nsproxy->pid_ns in do_notify_parent(), and if the main thread is zombie its ->nsproxy was already cleared by exit_task_namespaces(). Introduce the new function, find_new_reaper(), which finds the new ->parent for the re-parenting and changes ->child_reaper if needed. Kill the now unneeded exit_child_reaper(). Also move the changing of ->child_reaper from zap_pid_ns_processes() to find_new_reaper(), this consolidates the games with ->child_reaper and makes it stable under tasklist_lock. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=11391 Reported-by: Robert Rex Signed-off-by: Oleg Nesterov Acked-by: Serge Hallyn Acked-by: Pavel Emelyanov Acked-by: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 78 ++++++++++++++++++++++---------------------------- kernel/pid_namespace.c | 6 ---- 2 files changed, 34 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 75c6473..25ed2ad 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -831,26 +831,50 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father) * the child reaper process (ie "init") in our pid * space. */ +static struct task_struct *find_new_reaper(struct task_struct *father) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *thread; + + thread = father; + while_each_thread(father, thread) { + if (thread->flags & PF_EXITING) + continue; + if (unlikely(pid_ns->child_reaper == father)) + pid_ns->child_reaper = thread; + return thread; + } + + if (unlikely(pid_ns->child_reaper == father)) { + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) + panic("Attempted to kill init!"); + + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + /* + * We can not clear ->child_reaper or leave it alone. + * There may by stealth EXIT_DEAD tasks on ->children, + * forget_original_parent() must move them somewhere. + */ + pid_ns->child_reaper = init_pid_ns.child_reaper; + } + + return pid_ns->child_reaper; +} + static void forget_original_parent(struct task_struct *father) { - struct task_struct *p, *n, *reaper = father; + struct task_struct *p, *n, *reaper; LIST_HEAD(ptrace_dead); write_lock_irq(&tasklist_lock); - + reaper = find_new_reaper(father); /* * First clean up ptrace if we were using it. */ ptrace_exit(father, &ptrace_dead); - do { - reaper = next_thread(reaper); - if (reaper == father) { - reaper = task_child_reaper(father); - break; - } - } while (reaper->flags & PF_EXITING); - list_for_each_entry_safe(p, n, &father->children, sibling) { p->real_parent = reaper; if (p->parent == father) { @@ -959,39 +983,6 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -static inline void exit_child_reaper(struct task_struct *tsk) -{ - if (likely(tsk->group_leader != task_child_reaper(tsk))) - return; - - if (tsk->nsproxy->pid_ns == &init_pid_ns) - panic("Attempted to kill init!"); - - /* - * @tsk is the last thread in the 'cgroup-init' and is exiting. - * Terminate all remaining processes in the namespace and reap them - * before exiting @tsk. - * - * Note that @tsk (last thread of cgroup-init) may not necessarily - * be the child-reaper (i.e main thread of cgroup-init) of the - * namespace i.e the child_reaper may have already exited. - * - * Even after a child_reaper exits, we let it inherit orphaned children, - * because, pid_ns->child_reaper remains valid as long as there is - * at least one living sub-thread in the cgroup init. - - * This living sub-thread of the cgroup-init will be notified when - * a child inherited by the 'child-reaper' exits (do_notify_parent() - * uses __group_send_sig_info()). Further, when reaping child processes, - * do_wait() iterates over children of all living sub threads. - - * i.e even though 'child_reaper' thread is listed as the parent of the - * orphaned children, any living sub-thread in the cgroup-init can - * perform the role of the child_reaper. - */ - zap_pid_ns_processes(tsk->nsproxy->pid_ns); -} - NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; @@ -1051,7 +1042,6 @@ NORET_TYPE void do_exit(long code) } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { - exit_child_reaper(tsk); hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 598f1ee..fab8ea8 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -179,12 +179,6 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); - /* - * We can not clear ->child_reaper or leave it alone. - * There may by stealth EXIT_DEAD tasks on ->children, - * forget_original_parent() must move them somewhere. - */ - pid_ns->child_reaper = init_pid_ns.child_reaper; acct_exit_ns(pid_ns); return; } -- cgit v1.1 From 9d3593574702ae1899e23a1535da1ac71f928042 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Tue, 2 Sep 2008 14:36:13 -0700 Subject: pm_qos_requirement might sleep Make PM_QOS and CPU_IDLE play nicer when run with the RT-Preempt kernel. The purpose of the patch is to remove the spin_lock around the read in the function pm_qos_requirement - since spinlocks can sleep in -rt and this function is called from idle. CPU_IDLE polls the target_value's of some of the pm_qos parameters from the idle loop causing sleeping locking warnings. Changing the target_value to an atomic avoids this issue. Remove the spinlock in pm_qos_requirement by making target_value an atomic type. Signed-off-by: mark gross Signed-off-by: John Kacur Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pm_qos_params.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index da9c2dd..dfdec52 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -43,7 +43,7 @@ #include /* - * locking rule: all changes to target_value or requirements or notifiers lists + * locking rule: all changes to requirements or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ @@ -66,7 +66,7 @@ struct pm_qos_object { struct miscdevice pm_qos_power_miscdev; char *name; s32 default_value; - s32 target_value; + atomic_t target_value; s32 (*comparitor)(s32, s32); }; @@ -77,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = { .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = 2000 * USEC_PER_SEC, + .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), .comparitor = min_compare }; @@ -87,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = { .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = 2000 * USEC_PER_SEC, + .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), .comparitor = min_compare }; @@ -99,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = { .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, - .target_value = 0, + .target_value = ATOMIC_INIT(0), .comparitor = max_compare }; @@ -150,11 +150,11 @@ static void update_target(int target) extreme_value = pm_qos_array[target]->comparitor( extreme_value, node->value); } - if (pm_qos_array[target]->target_value != extreme_value) { + if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { call_notifier = 1; - pm_qos_array[target]->target_value = extreme_value; + atomic_set(&pm_qos_array[target]->target_value, extreme_value); pr_debug(KERN_ERR "new target for qos %d is %d\n", target, - pm_qos_array[target]->target_value); + atomic_read(&pm_qos_array[target]->target_value)); } spin_unlock_irqrestore(&pm_qos_lock, flags); @@ -193,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor) */ int pm_qos_requirement(int pm_qos_class) { - int ret_val; - unsigned long flags; - - spin_lock_irqsave(&pm_qos_lock, flags); - ret_val = pm_qos_array[pm_qos_class]->target_value; - spin_unlock_irqrestore(&pm_qos_lock, flags); - - return ret_val; + return atomic_read(&pm_qos_array[pm_qos_class]->target_value); } EXPORT_SYMBOL_GPL(pm_qos_requirement); -- cgit v1.1 From b380b0d4f7dffcc235c0facefa537d4655619101 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Sep 2008 17:05:57 +0100 Subject: forgotten refcount on sysctl root table We should've set refcount on the root sysctl table; otherwise we'll blow up the first time we get down to zero dynamically registered sysctl tables. Signed-off-by: Al Viro Tested-by: James Bottomley Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fe47133..50ec088 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -159,6 +159,7 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file * static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { + .count = 1, .ctl_table = root_table, .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), .root = &sysctl_table_root, -- cgit v1.1 From 268364a0f48aee2f851f9d1ef8a6cda0f3039ef1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:02:44 +0200 Subject: IO resources: add reserve_region_with_split() add reserve_region_with_split() to not lose e820 reserved entries if they overlap with existing IO regions: with test case by extend 0xe0000000 - 0xeffffff to 0xdd800000 - we get: e0000000-efffffff : PCI MMCONFIG 0 e0000000-efffffff : reserved and in /proc/iomem we get: found conflict for reserved [dd800000, efffffff], try to reserve with split __reserve_region_with_split: (PCI Bus #80) [dd000000, ddffffff], res: (reserved) [dd800000, efffffff] __reserve_region_with_split: (PCI Bus #00) [de000000, dfffffff], res: (reserved) [de000000, efffffff] initcall pci_subsys_init+0x0/0x121 returned 0 after 381 msecs in dmesg various fixes and improvements suggested by Linus. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/resource.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 03d796c..414d6fc 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -516,6 +516,74 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t return result; } +static void __init __reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + struct resource *parent = root; + struct resource *conflict; + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + + if (!res) + return; + + res->name = name; + res->start = start; + res->end = end; + res->flags = IORESOURCE_BUSY; + + for (;;) { + conflict = __request_resource(parent, res); + if (!conflict) + break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; + } + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; + } + + if (!res) { + printk(KERN_DEBUG " __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n", + conflict->name, conflict->start, conflict->end, + name, start, end); + + /* failed, split and try again */ + + /* conflict coverred whole area */ + if (conflict->start <= start && conflict->end >= end) + return; + + if (conflict->start > start) + __reserve_region_with_split(root, start, conflict->start-1, name); + if (!(conflict->flags & IORESOURCE_BUSY)) { + resource_size_t common_start, common_end; + + common_start = max(conflict->start, start); + common_end = min(conflict->end, end); + if (common_start < common_end) + __reserve_region_with_split(root, common_start, common_end, name); + } + if (conflict->end < end) + __reserve_region_with_split(root, conflict->end+1, end, name); + } + +} + +void reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + write_lock(&resource_lock); + __reserve_region_with_split(root, start, end, name); + write_unlock(&resource_lock); +} + EXPORT_SYMBOL(adjust_resource); /** -- cgit v1.1 From 1cf44baad76b6f20f95ece397c6f643320aa44c9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 4 Sep 2008 21:26:06 +0200 Subject: IO resources: fix/remove printk Andrew Morton noticed that the printk in kernel/resource.c was buggy: | start and end have type resource_size_t. Such types CANNOT be printed | unless cast to a known type. | | Because there is a %s following an incorrect %lld, the above code will | crash the machine. ... and it's probably quite unneeded as well, so remove it. Reported-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/resource.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 414d6fc..fc59dcc 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -549,13 +549,9 @@ static void __init __reserve_region_with_split(struct resource *root, } if (!res) { - printk(KERN_DEBUG " __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n", - conflict->name, conflict->start, conflict->end, - name, start, end); - /* failed, split and try again */ - /* conflict coverred whole area */ + /* conflict covered whole area */ if (conflict->start <= start && conflict->end >= end) return; -- cgit v1.1 From 7c1e76897492d92b6a1c2d6892494d39ded9680c Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Wed, 3 Sep 2008 21:36:50 +0000 Subject: clockevents: prevent clockevent event_handler ending up handler_noop There is a ordering related problem with clockevents code, due to which clockevents_register_device() called after tickless/highres switch will not work. The new clockevent ends up with clockevents_handle_noop as event handler, resulting in no timer activity. The problematic path seems to be * old device already has hrtimer_interrupt as the event_handler * new clockevent device registers with a higher rating * tick_check_new_device() is called * clockevents_exchange_device() gets called * old->event_handler is set to clockevents_handle_noop * tick_setup_device() is called for the new device * which sets new->event_handler using the old->event_handler which is noop. Change the ordering so that new device inherits the proper handler. This does not have any issue in normal case as most likely all the clockevent devices are setup before the highres switch. But, can potentially be affecting some corner case where HPET force detect happens after the highres switch. This was a problem with HPET in MSI mode code that we have been experimenting with. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Shaohua Li Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/clockevents.c | 3 +-- kernel/time/tick-common.c | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 3d1e3e1..1876b52 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -177,7 +177,7 @@ void clockevents_register_device(struct clock_event_device *dev) /* * Noop handler when we shut down an event device */ -static void clockevents_handle_noop(struct clock_event_device *dev) +void clockevents_handle_noop(struct clock_event_device *dev) { } @@ -199,7 +199,6 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. */ if (old) { - old->event_handler = clockevents_handle_noop; clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 80c4336..c477719 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -161,6 +161,7 @@ static void tick_setup_device(struct tick_device *td, } else { handler = td->evtdev->event_handler; next_event = td->evtdev->next_event; + td->evtdev->event_handler = clockevents_handle_noop; } td->evtdev = newdev; -- cgit v1.1 From d4496b39559c6d43f83e4c08b899984f8b8089b5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Sep 2008 21:36:57 +0000 Subject: clockevents: prevent endless loop in periodic broadcast handler The reprogramming of the periodic broadcast handler was broken, when the first programming returned -ETIME. The clockevents code stores the new expiry value in the clock events device next_event field only when the programming time has not been elapsed yet. The loop in question calculates the new expiry value from the next_event value and therefor never increases. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 31463d3..3044a88 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void) */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { + ktime_t next; + tick_do_periodic_broadcast(); /* @@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * Setup the next period for devices, which do not have - * periodic mode: + * periodic mode. We read dev->next_event first and add to it + * when the event alrady expired. clockevents_program_event() + * sets dev->next_event only when the event is really + * programmed to the device. */ - for (;;) { - ktime_t next = ktime_add(dev->next_event, tick_period); + for (next = dev->next_event; ;) { + next = ktime_add(next, tick_period); if (!clockevents_program_event(dev, next, ktime_get())) return; -- cgit v1.1 From 7205656ab48da29a95d7f55e43a81db755d3cb3a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Sep 2008 21:37:03 +0000 Subject: clockevents: enforce reprogram in oneshot setup In tick_oneshot_setup we program the device to the given next_event, but we do not check the return value. We need to make sure that the device is programmed enforced so the interrupt handler engine starts working. Split out the reprogramming function from tick_program_event() and call it with the device, which was handed in to tick_setup_oneshot(). Set the force argument, so the devices is firing an interrupt. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-oneshot.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 450c049..06595c6 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -23,11 +23,11 @@ #include "tick-internal.h" /** - * tick_program_event + * tick_program_event internal worker function */ -int tick_program_event(ktime_t expires, int force) +static int __tick_program_event(struct clock_event_device *dev, + ktime_t expires, int force) { - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; ktime_t now = ktime_get(); while (1) { @@ -41,6 +41,16 @@ int tick_program_event(ktime_t expires, int force) } /** + * tick_program_event + */ +int tick_program_event(ktime_t expires, int force) +{ + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + + return __tick_program_event(dev, expires, force); +} + +/** * tick_resume_onshot - resume oneshot mode */ void tick_resume_oneshot(void) @@ -61,7 +71,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, { newdev->event_handler = handler; clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); - clockevents_program_event(newdev, next_event, ktime_get()); + __tick_program_event(newdev, next_event, 1); } /** -- cgit v1.1 From 9c17bcda991000351cb2373f78be7e4b1c44caa3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Sep 2008 21:37:08 +0000 Subject: clockevents: prevent multiple init/shutdown While chasing the C1E/HPET bugreports I went through the clock events code inch by inch and found that the broadcast device can be initialized and shutdown multiple times. Multiple shutdowns are not critical, but useless waste of time. Multiple initializations are simply broken. Another CPU might have the device in use already after the first initialization and the second init could just render it unusable again. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 3044a88..5744f40 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -210,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why) struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags, *reason = why; - int cpu; + int cpu, bc_stopped; spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -228,6 +228,8 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_device_is_functional(dev)) goto out; + bc_stopped = cpus_empty(tick_broadcast_mask); + switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: @@ -250,9 +252,10 @@ static void tick_do_broadcast_on_off(void *why) break; } - if (cpus_empty(tick_broadcast_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); - else { + if (cpus_empty(tick_broadcast_mask)) { + if (!bc_stopped) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + } else if (bc_stopped) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else @@ -501,9 +504,12 @@ static void tick_broadcast_clear_oneshot(int cpu) */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { - bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - bc->next_event.tv64 = KTIME_MAX; + /* Set it up only once ! */ + if (bc->event_handler != tick_handle_oneshot_broadcast) { + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + bc->next_event.tv64 = KTIME_MAX; + } } /* -- cgit v1.1 From 1fb9b7d29d8e85ba3196eaa7ab871bf76fc98d36 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Sep 2008 21:37:14 +0000 Subject: clockevents: prevent endless loop lockup The C1E/HPET bug reports on AMDX2/RS690 systems where tracked down to a too small value of the HPET minumum delta for programming an event. The clockevents code needs to enforce an interrupt event on the clock event device in some cases. The enforcement code was stupid and naive, as it just added the minimum delta to the current time and tried to reprogram the device. When the minimum delta is too small, then this loops forever. Add a sanity check. Allow reprogramming to fail 3 times, then print a warning and double the minimum delta value to make sure, that this does not happen again. Use the same function for both tick-oneshot and tick-broadcast code. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 12 ++---------- kernel/time/tick-internal.h | 2 ++ kernel/time/tick-oneshot.c | 36 ++++++++++++++++++++++++++++++------ 3 files changed, 34 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 5744f40..2bc1f04 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -372,16 +372,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void) static int tick_broadcast_set_event(ktime_t expires, int force) { struct clock_event_device *bc = tick_broadcast_device.evtdev; - ktime_t now = ktime_get(); - int res; - - for(;;) { - res = clockevents_program_event(bc, expires, now); - if (!res || !force) - return res; - now = ktime_get(); - expires = ktime_add(now, ktime_set(0, bc->min_delta_ns)); - } + + return tick_dev_program_event(bc, expires, force); } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f13f2b7..0ffc291 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -17,6 +17,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), ktime_t nextevt); +extern int tick_dev_program_event(struct clock_event_device *dev, + ktime_t expires, int force); extern int tick_program_event(ktime_t expires, int force); extern void tick_oneshot_notify(void); extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 06595c6..2e35501 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -25,18 +25,42 @@ /** * tick_program_event internal worker function */ -static int __tick_program_event(struct clock_event_device *dev, - ktime_t expires, int force) +int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, + int force) { ktime_t now = ktime_get(); + int i; - while (1) { + for (i = 0;;) { int ret = clockevents_program_event(dev, expires, now); if (!ret || !force) return ret; + + /* + * We tried 2 times to program the device with the given + * min_delta_ns. If that's not working then we double it + * and emit a warning. + */ + if (++i > 2) { + printk(KERN_WARNING "CE: __tick_program_event of %s is " + "stuck %llx %llx\n", dev->name ? dev->name : "?", + now.tv64, expires.tv64); + printk(KERN_WARNING + "CE: increasing min_delta_ns %ld to %ld nsec\n", + dev->min_delta_ns, dev->min_delta_ns << 1); + WARN_ON(1); + + /* Double the min. delta and try again */ + if (!dev->min_delta_ns) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns <<= 1; + i = 0; + } + now = ktime_get(); - expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); + expires = ktime_add_ns(now, dev->min_delta_ns); } } @@ -47,7 +71,7 @@ int tick_program_event(ktime_t expires, int force) { struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; - return __tick_program_event(dev, expires, force); + return tick_dev_program_event(dev, expires, force); } /** @@ -71,7 +95,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, { newdev->event_handler = handler; clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); - __tick_program_event(newdev, next_event, 1); + tick_dev_program_event(newdev, next_event, 1); } /** -- cgit v1.1 From 56c7426b3951e4f35a71d695f1c982989399d6fd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 Sep 2008 16:44:23 +0200 Subject: sched_clock: fix NOHZ interaction If HLT stops the TSC, we'll fail to account idle time, thereby inflating the actual process times. Fix this by re-calibrating the clock against GTOD when leaving nohz mode. Signed-off-by: Peter Zijlstra Tested-by: Avi Kivity Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7a46bde..a87b046 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -162,6 +162,8 @@ void tick_nohz_stop_idle(int cpu) ts->idle_lastupdate = now; ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_active = 0; + + sched_clock_idle_wakeup_event(0); } } @@ -177,6 +179,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) } ts->idle_entrytime = now; ts->idle_active = 1; + sched_clock_idle_sleep_event(); return now; } -- cgit v1.1 From 49048622eae698e5c4ae61f7e71200f265ccc529 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 5 Sep 2008 18:12:23 +0200 Subject: sched: fix process time monotonicity Spencer reported a problem where utime and stime were going negative despite the fixes in commit b27f03d4bdc145a09fb7b0c0e004b29f1ee555fa. The suspected reason for the problem is that signal_struct maintains it's own utime and stime (of exited tasks), these are not updated using the new task_utime() routine, hence sig->utime can go backwards and cause the same problem to occur (sig->utime, adds tsk->utime and not task_utime()). This patch fixes the problem TODO: using max(task->prev_utime, derived utime) works for now, but a more generic solution is to implement cputime_max() and use the cputime_gt() function for comparison. Reported-by: spencer@bluehost.com Signed-off-by: Balbir Singh Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/exit.c | 6 +++--- kernel/sched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 25ed2ad..1639564 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -112,9 +112,9 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, tsk->utime); - sig->stime = cputime_add(sig->stime, tsk->stime); - sig->gtime = cputime_add(sig->gtime, tsk->gtime); + sig->utime = cputime_add(sig->utime, task_utime(tsk)); + sig->stime = cputime_add(sig->stime, task_stime(tsk)); + sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; diff --git a/kernel/sched.c b/kernel/sched.c index 9a1ddb8..1a5f73c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4179,6 +4179,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal) } /* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +cputime_t task_utime(struct task_struct *p) +{ + return p->utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + return p->stime; +} +#else +cputime_t task_utime(struct task_struct *p) +{ + clock_t utime = cputime_to_clock_t(p->utime), + total = utime + cputime_to_clock_t(p->stime); + u64 temp; + + /* + * Use CFS's precise accounting: + */ + temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + + if (total) { + temp *= utime; + do_div(temp, total); + } + utime = (clock_t)temp; + + p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); + return p->prev_utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + clock_t stime; + + /* + * Use CFS's precise accounting. (we subtract utime from + * the total, to make sure the total observed by userspace + * grows monotonically - apps rely on that): + */ + stime = nsec_to_clock_t(p->se.sum_exec_runtime) - + cputime_to_clock_t(task_utime(p)); + + if (stime >= 0) + p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + + return p->prev_stime; +} +#endif + +inline cputime_t task_gtime(struct task_struct *p) +{ + return p->gtime; +} + +/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. * -- cgit v1.1 From 7300711e8c6824fcfbd42a126980ff50439d8dd0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Sep 2008 03:01:45 +0200 Subject: clockevents: broadcast fixup possible waiters Until the C1E patches arrived there where no users of periodic broadcast before switching to oneshot mode. Now we need to trigger a possible waiter for a periodic broadcast when switching to oneshot mode. Otherwise we can starve them for ever. Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2bc1f04..2f5a382 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -491,6 +491,18 @@ static void tick_broadcast_clear_oneshot(int cpu) cpu_clear(cpu, tick_broadcast_oneshot_mask); } +static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) +{ + struct tick_device *td; + int cpu; + + for_each_cpu_mask_nr(cpu, *mask) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev) + td->evtdev->next_event = expires; + } +} + /** * tick_broadcast_setup_oneshot - setup the broadcast device */ @@ -498,9 +510,32 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { /* Set it up only once ! */ if (bc->event_handler != tick_handle_oneshot_broadcast) { + int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; + int cpu = smp_processor_id(); + cpumask_t mask; + bc->event_handler = tick_handle_oneshot_broadcast; clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - bc->next_event.tv64 = KTIME_MAX; + + /* Take the do_timer update */ + tick_do_timer_cpu = cpu; + + /* + * We must be careful here. There might be other CPUs + * waiting for periodic broadcast. We need to set the + * oneshot_mask bits for those and program the + * broadcast device to fire. + */ + mask = tick_broadcast_mask; + cpu_clear(cpu, mask); + cpus_or(tick_broadcast_oneshot_mask, + tick_broadcast_oneshot_mask, mask); + + if (was_periodic && !cpus_empty(mask)) { + tick_broadcast_init_next_event(&mask, tick_next_period); + tick_broadcast_set_event(tick_next_period, 1); + } else + bc->next_event.tv64 = KTIME_MAX; } } -- cgit v1.1 From c8bfff6dd4d41834f4952cbc49e28e31906a6188 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Fri, 5 Sep 2008 23:46:19 +0200 Subject: sched: compilation fix with gcc 3.4.6 I found that 2.6.27-rc5-mm1 does not compile with gcc 3.4.6. The error is: CC kernel/sched.o kernel/sched.c: In function `start_rt_bandwidth': kernel/sched.c:208: sorry, unimplemented: inlining failed in call to 'rt_bandwidth_enabled': function body not available kernel/sched.c:214: sorry, unimplemented: called from here make[1]: *** [kernel/sched.o] Error 1 make: *** [kernel] Error 2 It seems that the gcc 3.4.6 requires full inline definition before first usage. The patch below fixes the compilation problem. Signed-off-by: Krzysztof Helt (if needed> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 703f56d..4de2bfb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -204,7 +204,10 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } -static inline int rt_bandwidth_enabled(void); +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { @@ -841,11 +844,6 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -static inline int rt_bandwidth_enabled(void) -{ - return sysctl_sched_rt_runtime >= 0; -} - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif -- cgit v1.1 From 4ff4b9e19a80b73959ebeb28d1df40176686f0a8 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Fri, 5 Sep 2008 14:05:31 -0700 Subject: ntp: fix calculation of the next jiffie to trigger RTC sync We have a bug in the calculation of the next jiffie to trigger the RTC synchronisation. The aim here is to run sync_cmos_clock() as close as possible to the middle of a second. Which means we want this function to be called less than or equal to half a jiffie away from when now.tv_nsec equals 5e8 (500000000). If this is not the case for a given call to the function, for this purpose instead of updating the RTC we calculate the offset in nanoseconds to the next point in time where now.tv_nsec will be equal 5e8. The calculated offset is then converted to jiffies as these are the unit used by the timer. Hovewer timespec_to_jiffies() used here uses a ceil()-type rounding mode, where the resulting value is rounded up. As a result the range of now.tv_nsec when the timer will trigger is from 5e8 to 5e8 + TICK_NSEC rather than the desired 5e8 - TICK_NSEC / 2 to 5e8 + TICK_NSEC / 2. As a result if for example sync_cmos_clock() happens to be called at the time when now.tv_nsec is between 5e8 + TICK_NSEC / 2 and 5e8 to 5e8 + TICK_NSEC, it will simply be rescheduled HZ jiffies later, falling in the same range of now.tv_nsec again. Similarly for cases offsetted by an integer multiple of TICK_NSEC. This change addresses the problem by subtracting TICK_NSEC / 2 from the nanosecond offset to the next point in time where now.tv_nsec will be equal 5e8, effectively shifting the following rounding in timespec_to_jiffies() so that it produces a rounded-to-nearest result. Signed-off-by: Maciej W. Rozycki Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5125ddd..1ad46f3 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -245,7 +245,7 @@ static void sync_cmos_clock(unsigned long dummy) if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) fail = update_persistent_clock(now); - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); if (next.tv_nsec <= 0) next.tv_nsec += NSEC_PER_SEC; -- cgit v1.1 From 38736f475071b80b66be28af7b44c854073699cc Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Sat, 6 Sep 2008 14:50:23 +0530 Subject: sched: fix __load_balance_iterator() for cfq with only one task The __load_balance_iterator() returns a NULL when there's only one sched_entity which is a task. It is caused by the following code-path. /* Skip over entities that are not tasks */ do { se = list_entry(next, struct sched_entity, group_node); next = next->next; } while (next != &cfs_rq->tasks && !entity_is_task(se)); if (next == &cfs_rq->tasks) return NULL; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This will return NULL even when se is a task. As a side-effect, there was a regression in sched_mc behavior since 2.6.25, since iter_move_one_task() when it calls load_balance_start_fair(), would not get any tasks to move! Fix this by checking if the last entity was a task or not. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8264bb5..a10ac0b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1458,7 +1458,7 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) next = next->next; } while (next != &cfs_rq->tasks && !entity_is_task(se)); - if (next == &cfs_rq->tasks) + if (next == &cfs_rq->tasks && !entity_is_task(se)) return NULL; cfs_rq->balance_iterator = next; -- cgit v1.1 From 3ba35573ad9a149a3af19625b502679283382f6b Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sun, 31 Aug 2008 19:58:49 +0200 Subject: kernel/cpu.c: Move the CPU_DYING notifiers When a cpu is taken offline, the CPU_DYING notifiers are called on the dying cpu. According to , the cpu should be "not running any task, not handling interrupts, soon dead". For the current implementation, this is not true: - __cpu_disable can fail. If it fails, then the cpu will remain alive and happy. - At least on x86, __cpu_disable() briefly enables the local interrupts to handle any outstanding interrupts. What about moving CPU_DYING down a few lines, behind the __cpu_disable() line? There are only two CPU_DYING handlers in the kernel right now: one in kvm, one in the scheduler. Both should work with the patch applied [and: I'm not sure if either one handles a failing __cpu_disable()] The patch survives simple offlining a cpu. kvm untested due to lack of a test setup. Signed-off-By: Manfred Spraul Signed-off-by: Ingo Molnar --- kernel/cpu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index f17e985..9e7ebde 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param) struct take_cpu_down_param *param = _param; int err; - raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, - param->hcpu); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; + raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, + param->hcpu); + /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); -- cgit v1.1 From dfb512ec4834116124da61d6c1ee10fd0aa32bd6 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Fri, 29 Aug 2008 13:11:41 -0700 Subject: sched: arch_reinit_sched_domains() must destroy domains to force rebuild What I realized recently is that calling rebuild_sched_domains() in arch_reinit_sched_domains() by itself is not enough when cpusets are enabled. partition_sched_domains() code is trying to avoid unnecessary domain rebuilds and will not actually rebuild anything if new domain masks match the old ones. What this means is that doing echo 1 > /sys/devices/system/cpu/sched_mc_power_savings on a system with cpusets enabled will not take affect untill something changes in the cpuset setup (ie new sets created or deleted). This patch fixes restore correct behaviour where domains must be rebuilt in order to enable MC powersaving flags. Test on quad-core Core2 box with both CONFIG_CPUSETS and !CONFIG_CPUSETS. Also tested on dual-core Core2 laptop. Lockdep is happy and things are working as expected. Signed-off-by: Max Krasnyansky Tested-by: Vaidyanathan Srinivasan Signed-off-by: Ingo Molnar --- kernel/sched.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d601fb0..d72ee9a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7589,24 +7589,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * and partition_sched_domains() will fallback to the single partition * 'fallback_doms', it also forces the domains to be rebuilt. * + * If doms_new==NULL it will be replaced with cpu_online_map. + * ndoms_new==0 is a special case for destroying existing domains. + * It will not create the default domain. + * * Call with hotplug lock held */ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { - int i, j; + int i, j, n; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); - if (doms_new == NULL) - ndoms_new = 0; + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < ndoms_new; j++) { + for (j = 0; j < n; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; @@ -7619,7 +7622,6 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; - ndoms_new = 1; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); dattr_new = NULL; @@ -7656,8 +7658,13 @@ match2: int arch_reinit_sched_domains(void) { get_online_cpus(); + + /* Destroy domains first to force the rebuild */ + partition_sched_domains(0, NULL, NULL); + rebuild_sched_domains(); put_online_cpus(); + return 0; } @@ -7741,7 +7748,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - partition_sched_domains(0, NULL, NULL); + partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; default: -- cgit v1.1 From 978b0116cd225682a29e3d1d5010319bf2de32c2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 6 Sep 2008 20:04:36 +0200 Subject: softirq: allocate less vectors We don't need whole 32 of them, only NR_SOFTIRQS. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index c506f26..82e32aa 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -46,7 +46,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; EXPORT_SYMBOL(irq_stat); #endif -static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; +static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); -- cgit v1.1 From e545a6140b698b2494daf0b32107bdcc5e901390 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sun, 7 Sep 2008 16:57:22 +0200 Subject: kernel/cpu.c: create a CPU_STARTING cpu_chain notifier Right now, there is no notifier that is called on a new cpu, before the new cpu begins processing interrupts/softirqs. Various kernel function would need that notification, e.g. kvm works around by calling smp_call_function_single(), rcu polls cpu_online_map. The patch adds a CPU_STARTING notification. It also adds a helper function that sends the message to all cpu_chain handlers. Tested on x86-64. All other archs are untested. Especially on sparc, I'm not sure if I got it right. Signed-off-by: Manfred Spraul Signed-off-by: Ingo Molnar --- kernel/cpu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index f17e985..dc45f24 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -453,6 +453,25 @@ out: } #endif /* CONFIG_PM_SLEEP_SMP */ +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). + */ +void notify_cpu_starting(unsigned int cpu) +{ + unsigned long val = CPU_STARTING; + +#ifdef CONFIG_PM_SLEEP_SMP + if (cpu_isset(cpu, frozen_cpus)) + val = CPU_STARTING_FROZEN; +#endif /* CONFIG_PM_SLEEP_SMP */ + raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); +} + #endif /* CONFIG_SMP */ /* -- cgit v1.1 From 61c22c34c6f80a8e89cff5ff717627c54cc14fd4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Sep 2008 21:38:57 +0200 Subject: clockevents: remove WARN_ON which was used to gather information The issue of the endless reprogramming loop due to a too small min_delta_ns was fixed with the previous updates of the clock events code, but we had no information about the spread of this problem. I added a WARN_ON to get automated information via kerneloops.org and to get some direct reports, which allowed me to analyse the affected machines. The WARN_ON has served its purpose and would be annoying for a release kernel. Remove it and just keep the information about the increase of the min_delta_ns value. Signed-off-by: Thomas Gleixner --- kernel/time/tick-oneshot.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 2e35501..2e8de67 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -43,19 +43,17 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, * and emit a warning. */ if (++i > 2) { - printk(KERN_WARNING "CE: __tick_program_event of %s is " - "stuck %llx %llx\n", dev->name ? dev->name : "?", - now.tv64, expires.tv64); - printk(KERN_WARNING - "CE: increasing min_delta_ns %ld to %ld nsec\n", - dev->min_delta_ns, dev->min_delta_ns << 1); - WARN_ON(1); - - /* Double the min. delta and try again */ + /* Increase the min. delta and try again */ if (!dev->min_delta_ns) dev->min_delta_ns = 5000; else - dev->min_delta_ns <<= 1; + dev->min_delta_ns += dev->min_delta_ns >> 1; + + printk(KERN_WARNING + "CE: %s increasing min_delta_ns to %lu nsec\n", + dev->name ? dev->name : "?", + dev->min_delta_ns << 1); + i = 0; } -- cgit v1.1 From baf25731e54d06eb13dc4eda78c6dc7da4ce84e8 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Tue, 9 Sep 2008 11:26:33 +0800 Subject: sched: fix 2.6.27-rc5 couldn't boot on tulsa machine randomly On my tulsa x86-64 machine, kernel 2.6.25-rc5 couldn't boot randomly. Basically, function __enable_runtime forgets to reset rt_rq->rt_throttled to 0. When every cpu is up, per-cpu migration_thread is created and it runs very fast, sometimes to mark the corresponding rt_rq->rt_throttled to 1 very quickly. After all cpus are up, with below calling chain: sched_init_smp => arch_init_sched_domains => build_sched_domains => ... => cpu_attach_domain => rq_attach_root => set_rq_online => ... => _enable_runtime _enable_runtime is called against every rt_rq again, so rt_rq->rt_time is reset to 0, but rt_rq->rt_throttled might be still 1. Later on function do_sched_rt_period_timer couldn't reset it, and all RT tasks couldn't be scheduled to run on that cpu. here is RT task migration_thread which is woken up when a task is migrated to another cpu. Below patch fixes it against 2.6.27-rc5. Signed-off-by: Zhang Yanmin Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 5523107..1113157 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -350,6 +350,7 @@ static void __enable_runtime(struct rq *rq) spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_b->rt_runtime; rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); } -- cgit v1.1 From ec5d498991e87c74730509508b25c3959192b7e7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 10 Sep 2008 17:00:19 -0700 Subject: sched: fix deadlock in setting scheduler parameter to zero Andrei Gusev wrote: > I played witch scheduler settings. After doing something like: > echo -n 1000000 >sched_rt_period_us > > command is locked. I found in kernel.log: > > Sep 11 00:39:34 zaratustra > Sep 11 00:39:34 zaratustra Pid: 4495, comm: bash Tainted: G W > (2.6.26.3 #12) > Sep 11 00:39:34 zaratustra EIP: 0060:[] EFLAGS: 00210246 CPU: 0 > Sep 11 00:39:34 zaratustra EIP is at div64_u64+0x57/0x80 > Sep 11 00:39:34 zaratustra EAX: 0000389f EBX: 00000000 ECX: 00000000 > EDX: 00000000 > Sep 11 00:39:34 zaratustra ESI: d9800000 EDI: d9800000 EBP: 0000389f > ESP: ea7a6edc > Sep 11 00:39:34 zaratustra DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 > Sep 11 00:39:34 zaratustra Process bash (pid: 4495, ti=ea7a6000 > task=ea744000 task.ti=ea7a6000) > Sep 11 00:39:34 zaratustra Stack: 00000000 000003e8 d9800000 0000389f > c0119042 00000000 00000000 00000001 > Sep 11 00:39:34 zaratustra 00000000 00000000 ea7a6f54 00010000 00000000 > c04d2e80 00000001 000e7ef0 > Sep 11 00:39:34 zaratustra c01191a3 00000000 00000000 ea7a6fa0 00000001 > ffffffff c04d2e80 ea5b2480 > Sep 11 00:39:34 zaratustra Call Trace: > Sep 11 00:39:34 zaratustra [] __rt_schedulable+0x52/0x130 > Sep 11 00:39:34 zaratustra [] sched_rt_handler+0x83/0x120 > Sep 11 00:39:34 zaratustra [] proc_sys_call_handler+0xb6/0xd0 > Sep 11 00:39:34 zaratustra [] proc_sys_write+0x0/0x20 > Sep 11 00:39:34 zaratustra [] proc_sys_write+0x19/0x20 > Sep 11 00:39:34 zaratustra [] vfs_write+0xa8/0x140 > Sep 11 00:39:34 zaratustra [] sys_write+0x41/0x80 > Sep 11 00:39:34 zaratustra [] sysenter_past_esp+0x6a/0x91 > Sep 11 00:39:34 zaratustra ======================= > Sep 11 00:39:34 zaratustra Code: c8 41 0f ad f3 d3 ee f6 c1 20 0f 45 de > 31 f6 0f ad ef d3 ed f6 c1 20 0f 45 fd 0f 45 ee 31 c9 39 eb 89 fe 89 ea > 77 08 89 e8 31 d2 f3 89 c1 89 f0 8b 7c 24 08 f7 f3 8b 74 24 04 89 > ca 8b 1c 24 > Sep 11 00:39:34 zaratustra EIP: [] div64_u64+0x57/0x80 SS:ESP > 0068:ea7a6edc > Sep 11 00:39:34 zaratustra ---[ end trace 4eaa2a86a8e2da22 ]--- fix the boundary condition. sysctl_sched_rt_period=0 makes exception at to_ratio(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cc1f81b..9889080 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8909,6 +8909,9 @@ static int sched_rt_global_constraints(void) u64 rt_runtime, rt_period; int ret = 0; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); rt_runtime = tg->rt_bandwidth.rt_runtime; @@ -8925,6 +8928,9 @@ static int sched_rt_global_constraints(void) unsigned long flags; int i; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; -- cgit v1.1 From 72c57ed50663dc04b0b329beaec39b557c8ac5a5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 11 Sep 2008 23:29:54 -0700 Subject: sysctl: Use CONFIG_SPARC instead of __sparc__ for ifdef tests. Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fe47133..eda6162 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -118,7 +118,7 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif -#ifdef __sparc__ +#ifdef CONFIG_SPARC extern char reboot_command []; extern int stop_a_enabled; extern int scons_pwroff; @@ -414,7 +414,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#ifdef __sparc__ +#ifdef CONFIG_SPARC { .ctl_name = KERN_SPARC_REBOOT, .procname = "reboot-cmd", -- cgit v1.1 From 17f04fbb0f7153d95ec33da81189b113cc778157 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 11 Sep 2008 23:33:53 -0700 Subject: sysctl: Use header file for sysctl knob declarations on sparc. This also takes care of a sparse warning as scons_pwroff's definition point. Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index eda6162..da51520 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -119,9 +119,7 @@ extern int sg_big_buff; #endif #ifdef CONFIG_SPARC -extern char reboot_command []; -extern int stop_a_enabled; -extern int scons_pwroff; +#include #endif #ifdef __hppa__ -- cgit v1.1 From 4e74339af6a59c061cf02f1ac497766bca1de19a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 13 Sep 2008 02:33:08 -0700 Subject: cpuset: avoid changing cpuset's cpus when -errno returned After the patch: commit 0b2f630a28d53b5a2082a5275bc3334b10373508 Author: Miao Xie Date: Fri Jul 25 01:47:21 2008 -0700 cpusets: restructure the function update_cpumask() and update_nodemask() It might happen that 'echo 0 > /cpuset/sub/cpus' returned failure but 'cpus' has been changed, because cpus was changed before calling heap_init() which may return -ENOMEM. This patch restores the orginal behavior. Signed-off-by: Li Zefan Acked-by: Paul Menage Cc: Paul Jackson Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f227bc1..827cd9ad 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -843,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk, /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * * Called with cgroup_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * Return 0 if successful, -errno if not. + * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * if @heap != NULL. */ -static int update_tasks_cpumask(struct cpuset *cs) +static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { struct cgroup_scanner scan; - struct ptr_heap heap; - int retval; - - /* - * cgroup_scan_tasks() will initialize heap->gt for us. - * heap_init() is still needed here for we should not change - * cs->cpus_allowed when heap_init() fails. - */ - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval) - return retval; scan.cg = cs->css.cgroup; scan.test_task = cpuset_test_cpumask; scan.process_task = cpuset_change_cpumask; - scan.heap = &heap; - retval = cgroup_scan_tasks(&scan); - - heap_free(&heap); - return retval; + scan.heap = heap; + cgroup_scan_tasks(&scan); } /** @@ -883,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs) */ static int update_cpumask(struct cpuset *cs, const char *buf) { + struct ptr_heap heap; struct cpuset trialcs; int retval; int is_load_balanced; @@ -917,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) return 0; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); + if (retval) + return retval; + is_load_balanced = is_sched_load_balance(&trialcs); mutex_lock(&callback_mutex); @@ -927,9 +920,9 @@ static int update_cpumask(struct cpuset *cs, const char *buf) * Scan tasks in the cpuset, and update the cpumasks of any * that need an update. */ - retval = update_tasks_cpumask(cs); - if (retval < 0) - return retval; + update_tasks_cpumask(cs, &heap); + + heap_free(&heap); if (is_load_balanced) async_rebuild_sched_domains(); @@ -1965,7 +1958,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root) nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); else { - update_tasks_cpumask(cp); + update_tasks_cpumask(cp, NULL); update_tasks_nodemask(cp, &oldmems); } } -- cgit v1.1 From f06febc96ba8e0af80bcc3eaec0a109e88275fac Mon Sep 17 00:00:00 2001 From: Frank Mayhar Date: Fri, 12 Sep 2008 09:54:39 -0700 Subject: timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar Cc: Roland McGrath Cc: Alexey Dobriyan Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/compat.c | 53 ++---- kernel/exit.c | 19 +- kernel/fork.c | 88 +++++---- kernel/itimer.c | 33 +--- kernel/posix-cpu-timers.c | 471 +++++++++++++++++++++++++--------------------- kernel/sched.c | 53 +++++- kernel/sched_fair.c | 1 + kernel/sched_rt.c | 4 +- kernel/signal.c | 8 +- kernel/sys.c | 75 +++----- 10 files changed, 415 insertions(+), 390 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 32c254a..72650e3 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -150,49 +151,23 @@ asmlinkage long compat_sys_setitimer(int which, return 0; } +static compat_clock_t clock_t_to_compat_clock_t(clock_t x) +{ + return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); +} + asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) { - /* - * In the SMP world we might just be unlucky and have one of - * the times increment as we use it. Since the value is an - * atomically safe type this is just fine. Conceptually its - * as if the syscall took an instant longer to occur. - */ if (tbuf) { + struct tms tms; struct compat_tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; - cputime_t utime, stime, cutime, cstime; - - read_lock(&tasklist_lock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - /* - * While we have tasklist_lock read-locked, no dying thread - * can be updating current->signal->[us]time. Instead, - * we got their counts included in the live thread loop. - * However, another thread can come in right now and - * do a wait call that updates current->signal->c[us]time. - * To make sure we always see that pair updated atomically, - * we take the siglock around fetching them. - */ - spin_lock_irq(&tsk->sighand->siglock); - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); - - tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime)); - tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime)); - tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime)); - tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime)); + + do_sys_times(&tms); + /* Convert our struct tms to the compat version. */ + tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); + tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); + tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); + tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } diff --git a/kernel/exit.c b/kernel/exit.c index 1639564..40036ac 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -112,8 +112,6 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, task_utime(tsk)); - sig->stime = cputime_add(sig->stime, task_stime(tsk)); sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; @@ -122,7 +120,6 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -1294,6 +1291,7 @@ static int wait_task_zombie(struct task_struct *p, int options, if (likely(!traced)) { struct signal_struct *psig; struct signal_struct *sig; + struct task_cputime cputime; /* * The resource counters for the group leader are in its @@ -1309,20 +1307,23 @@ static int wait_task_zombie(struct task_struct *p, int options, * need to protect the access to p->parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. + * + * We use thread_group_cputime() to get times for the thread + * group, which consolidates times for all threads in the + * group including the group leader. */ spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; + thread_group_cputime(p, &cputime); psig->cutime = cputime_add(psig->cutime, - cputime_add(p->utime, - cputime_add(sig->utime, - sig->cutime))); + cputime_add(cputime.utime, + sig->cutime)); psig->cstime = cputime_add(psig->cstime, - cputime_add(p->stime, - cputime_add(sig->stime, - sig->cstime))); + cputime_add(cputime.stime, + sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, diff --git a/kernel/fork.c b/kernel/fork.c index 7ce2ebe..a8ac2ef 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -759,15 +759,44 @@ void __cleanup_sighand(struct sighand_struct *sighand) kmem_cache_free(sighand_cachep, sighand); } + +/* + * Initialize POSIX timer handling for a thread group. + */ +static void posix_cpu_timers_init_group(struct signal_struct *sig) +{ + /* Thread group counters. */ + thread_group_cputime_init(sig); + + /* Expiration times and increments. */ + sig->it_virt_expires = cputime_zero; + sig->it_virt_incr = cputime_zero; + sig->it_prof_expires = cputime_zero; + sig->it_prof_incr = cputime_zero; + + /* Cached expiration times. */ + sig->cputime_expires.prof_exp = cputime_zero; + sig->cputime_expires.virt_exp = cputime_zero; + sig->cputime_expires.sched_exp = 0; + + /* The timer lists. */ + INIT_LIST_HEAD(&sig->cpu_timers[0]); + INIT_LIST_HEAD(&sig->cpu_timers[1]); + INIT_LIST_HEAD(&sig->cpu_timers[2]); +} + static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; int ret; if (clone_flags & CLONE_THREAD) { - atomic_inc(¤t->signal->count); - atomic_inc(¤t->signal->live); - return 0; + ret = thread_group_cputime_clone_thread(current, tsk); + if (likely(!ret)) { + atomic_inc(¤t->signal->count); + atomic_inc(¤t->signal->live); + } + return ret; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; @@ -795,15 +824,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; - sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = NULL; - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; + sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; @@ -820,14 +844,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { - /* - * New sole thread in the process gets an expiry time - * of the whole CPU time limit. - */ - tsk->it_prof_expires = - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); - } + posix_cpu_timers_init_group(sig); + acct_init_pacct(&sig->pacct); tty_audit_fork(sig); @@ -837,6 +855,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_signal(struct signal_struct *sig) { + thread_group_cputime_free(sig); exit_thread_group_keys(sig); kmem_cache_free(signal_cachep, sig); } @@ -886,6 +905,19 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif /* CONFIG_MM_OWNER */ /* + * Initialize POSIX timer handling for a single task. + */ +static void posix_cpu_timers_init(struct task_struct *tsk) +{ + tsk->cputime_expires.prof_exp = cputime_zero; + tsk->cputime_expires.virt_exp = cputime_zero; + tsk->cputime_expires.sched_exp = 0; + INIT_LIST_HEAD(&tsk->cpu_timers[0]); + INIT_LIST_HEAD(&tsk->cpu_timers[1]); + INIT_LIST_HEAD(&tsk->cpu_timers[2]); +} + +/* * This creates a new process as a copy of the old one, * but does not actually start it yet. * @@ -995,12 +1027,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, task_io_accounting_init(&p->ioac); acct_clear_integrals(p); - p->it_virt_expires = cputime_zero; - p->it_prof_expires = cputime_zero; - p->it_sched_expires = 0; - INIT_LIST_HEAD(&p->cpu_timers[0]); - INIT_LIST_HEAD(&p->cpu_timers[1]); - INIT_LIST_HEAD(&p->cpu_timers[2]); + posix_cpu_timers_init(p); p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); @@ -1201,21 +1228,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) { p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); - - if (!cputime_eq(current->signal->it_virt_expires, - cputime_zero) || - !cputime_eq(current->signal->it_prof_expires, - cputime_zero) || - current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || - !list_empty(¤t->signal->cpu_timers[0]) || - !list_empty(¤t->signal->cpu_timers[1]) || - !list_empty(¤t->signal->cpu_timers[2])) { - /* - * Have child wake up on its first tick to check - * for process CPU timers. - */ - p->it_prof_expires = jiffies_to_cputime(1); - } } if (likely(p->pid)) { diff --git a/kernel/itimer.c b/kernel/itimer.c index ab98274..db7c358 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -55,17 +55,15 @@ int do_getitimer(int which, struct itimerval *value) spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_virt_expires; cinterval = tsk->signal->it_virt_incr; if (!cputime_eq(cval, cputime_zero)) { - struct task_struct *t = tsk; - cputime_t utime = tsk->signal->utime; - do { - utime = cputime_add(utime, t->utime); - t = next_thread(t); - } while (t != tsk); + struct task_cputime cputime; + cputime_t utime; + + thread_group_cputime(tsk, &cputime); + utime = cputime.utime; if (cputime_le(cval, utime)) { /* about to fire */ cval = jiffies_to_cputime(1); } else { @@ -73,25 +71,19 @@ int do_getitimer(int which, struct itimerval *value) } } spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); cputime_to_timeval(cval, &value->it_value); cputime_to_timeval(cinterval, &value->it_interval); break; case ITIMER_PROF: - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_prof_expires; cinterval = tsk->signal->it_prof_incr; if (!cputime_eq(cval, cputime_zero)) { - struct task_struct *t = tsk; - cputime_t ptime = cputime_add(tsk->signal->utime, - tsk->signal->stime); - do { - ptime = cputime_add(ptime, - cputime_add(t->utime, - t->stime)); - t = next_thread(t); - } while (t != tsk); + struct task_cputime times; + cputime_t ptime; + + thread_group_cputime(tsk, ×); + ptime = cputime_add(times.utime, times.stime); if (cputime_le(cval, ptime)) { /* about to fire */ cval = jiffies_to_cputime(1); } else { @@ -99,7 +91,6 @@ int do_getitimer(int which, struct itimerval *value) } } spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); cputime_to_timeval(cval, &value->it_value); cputime_to_timeval(cinterval, &value->it_interval); break; @@ -185,7 +176,6 @@ again: case ITIMER_VIRTUAL: nval = timeval_to_cputime(&value->it_value); ninterval = timeval_to_cputime(&value->it_interval); - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_virt_expires; cinterval = tsk->signal->it_virt_incr; @@ -200,7 +190,6 @@ again: tsk->signal->it_virt_expires = nval; tsk->signal->it_virt_incr = ninterval; spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); if (ovalue) { cputime_to_timeval(cval, &ovalue->it_value); cputime_to_timeval(cinterval, &ovalue->it_interval); @@ -209,7 +198,6 @@ again: case ITIMER_PROF: nval = timeval_to_cputime(&value->it_value); ninterval = timeval_to_cputime(&value->it_interval); - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_prof_expires; cinterval = tsk->signal->it_prof_incr; @@ -224,7 +212,6 @@ again: tsk->signal->it_prof_expires = nval; tsk->signal->it_prof_incr = ninterval; spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); if (ovalue) { cputime_to_timeval(cval, &ovalue->it_value); cputime_to_timeval(cinterval, &ovalue->it_interval); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c42a03a..dba1c33 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -8,6 +8,99 @@ #include #include +#ifdef CONFIG_SMP +/* + * Allocate the thread_group_cputime structure appropriately for SMP kernels + * and fill in the current values of the fields. Called from copy_signal() + * via thread_group_cputime_clone_thread() when adding a second or subsequent + * thread to a thread group. Assumes interrupts are enabled when called. + */ +int thread_group_cputime_alloc_smp(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct task_cputime *cputime; + + /* + * If we have multiple threads and we don't already have a + * per-CPU task_cputime struct, allocate one and fill it in with + * the times accumulated so far. + */ + if (sig->cputime.totals) + return 0; + cputime = alloc_percpu(struct task_cputime); + if (cputime == NULL) + return -ENOMEM; + read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); + if (sig->cputime.totals) { + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + free_percpu(cputime); + return 0; + } + sig->cputime.totals = cputime; + cputime = per_cpu_ptr(sig->cputime.totals, get_cpu()); + cputime->utime = tsk->utime; + cputime->stime = tsk->stime; + cputime->sum_exec_runtime = tsk->se.sum_exec_runtime; + put_cpu_no_resched(); + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + return 0; +} + +/** + * thread_group_cputime_smp - Sum the thread group time fields across all CPUs. + * + * @tsk: The task we use to identify the thread group. + * @times: task_cputime structure in which we return the summed fields. + * + * Walk the list of CPUs to sum the per-CPU time fields in the thread group + * time structure. + */ +void thread_group_cputime_smp( + struct task_struct *tsk, + struct task_cputime *times) +{ + struct signal_struct *sig; + int i; + struct task_cputime *tot; + + sig = tsk->signal; + if (unlikely(!sig) || !sig->cputime.totals) { + times->utime = tsk->utime; + times->stime = tsk->stime; + times->sum_exec_runtime = tsk->se.sum_exec_runtime; + return; + } + times->stime = times->utime = cputime_zero; + times->sum_exec_runtime = 0; + for_each_possible_cpu(i) { + tot = per_cpu_ptr(tsk->signal->cputime.totals, i); + times->utime = cputime_add(times->utime, tot->utime); + times->stime = cputime_add(times->stime, tot->stime); + times->sum_exec_runtime += tot->sum_exec_runtime; + } +} + +#endif /* CONFIG_SMP */ + +/* + * Called after updating RLIMIT_CPU to set timer expiration if necessary. + */ +void update_rlimit_cpu(unsigned long rlim_new) +{ + cputime_t cputime; + + cputime = secs_to_cputime(rlim_new); + if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || + cputime_lt(current->signal->it_prof_expires, cputime)) { + spin_lock_irq(¤t->sighand->siglock); + set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(¤t->sighand->siglock); + } +} + static int check_clock(const clockid_t which_clock) { int error = 0; @@ -158,10 +251,6 @@ static inline cputime_t virt_ticks(struct task_struct *p) { return p->utime; } -static inline unsigned long long sched_ns(struct task_struct *p) -{ - return task_sched_runtime(p); -} int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { @@ -211,7 +300,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, cpu->cpu = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = sched_ns(p); + cpu->sched = task_sched_runtime(p); break; } return 0; @@ -226,31 +315,20 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, struct task_struct *p, union cpu_time_count *cpu) { - struct task_struct *t = p; - switch (clock_idx) { + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + switch (clock_idx) { default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); - do { - cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t)); - t = next_thread(t); - } while (t != p); + cpu->cpu = cputime_add(cputime.utime, cputime.stime); break; case CPUCLOCK_VIRT: - cpu->cpu = p->signal->utime; - do { - cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t)); - t = next_thread(t); - } while (t != p); + cpu->cpu = cputime.utime; break; case CPUCLOCK_SCHED: - cpu->sched = p->signal->sum_sched_runtime; - /* Add in each other live thread. */ - while ((t = next_thread(t)) != p) { - cpu->sched += t->se.sum_exec_runtime; - } - cpu->sched += sched_ns(p); + cpu->sched = thread_group_sched_runtime(p); break; } return 0; @@ -471,80 +549,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk) } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - cleanup_timers(tsk->signal->cpu_timers, - cputime_add(tsk->utime, tsk->signal->utime), - cputime_add(tsk->stime, tsk->signal->stime), - tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime); -} - - -/* - * Set the expiry times of all the threads in the process so one of them - * will go off before the process cumulative expiry total is reached. - */ -static void process_timer_rebalance(struct task_struct *p, - unsigned int clock_idx, - union cpu_time_count expires, - union cpu_time_count val) -{ - cputime_t ticks, left; - unsigned long long ns, nsleft; - struct task_struct *t = p; - unsigned int nthreads = atomic_read(&p->signal->live); - - if (!nthreads) - return; + struct task_cputime cputime; - switch (clock_idx) { - default: - BUG(); - break; - case CPUCLOCK_PROF: - left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), - nthreads); - do { - if (likely(!(t->flags & PF_EXITING))) { - ticks = cputime_add(prof_ticks(t), left); - if (cputime_eq(t->it_prof_expires, - cputime_zero) || - cputime_gt(t->it_prof_expires, ticks)) { - t->it_prof_expires = ticks; - } - } - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_VIRT: - left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), - nthreads); - do { - if (likely(!(t->flags & PF_EXITING))) { - ticks = cputime_add(virt_ticks(t), left); - if (cputime_eq(t->it_virt_expires, - cputime_zero) || - cputime_gt(t->it_virt_expires, ticks)) { - t->it_virt_expires = ticks; - } - } - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_SCHED: - nsleft = expires.sched - val.sched; - do_div(nsleft, nthreads); - nsleft = max_t(unsigned long long, nsleft, 1); - do { - if (likely(!(t->flags & PF_EXITING))) { - ns = t->se.sum_exec_runtime + nsleft; - if (t->it_sched_expires == 0 || - t->it_sched_expires > ns) { - t->it_sched_expires = ns; - } - } - t = next_thread(t); - } while (t != p); - break; - } + thread_group_cputime(tsk, &cputime); + cleanup_timers(tsk->signal->cpu_timers, + cputime.utime, cputime.stime, cputime.sum_exec_runtime); } static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) @@ -608,29 +617,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_PROF: - if (cputime_eq(p->it_prof_expires, + if (cputime_eq(p->cputime_expires.prof_exp, cputime_zero) || - cputime_gt(p->it_prof_expires, + cputime_gt(p->cputime_expires.prof_exp, nt->expires.cpu)) - p->it_prof_expires = nt->expires.cpu; + p->cputime_expires.prof_exp = + nt->expires.cpu; break; case CPUCLOCK_VIRT: - if (cputime_eq(p->it_virt_expires, + if (cputime_eq(p->cputime_expires.virt_exp, cputime_zero) || - cputime_gt(p->it_virt_expires, + cputime_gt(p->cputime_expires.virt_exp, nt->expires.cpu)) - p->it_virt_expires = nt->expires.cpu; + p->cputime_expires.virt_exp = + nt->expires.cpu; break; case CPUCLOCK_SCHED: - if (p->it_sched_expires == 0 || - p->it_sched_expires > nt->expires.sched) - p->it_sched_expires = nt->expires.sched; + if (p->cputime_expires.sched_exp == 0 || + p->cputime_expires.sched_exp > + nt->expires.sched) + p->cputime_expires.sched_exp = + nt->expires.sched; break; } } else { /* - * For a process timer, we must balance - * all the live threads' expirations. + * For a process timer, set the cached expiration time. */ switch (CPUCLOCK_WHICH(timer->it_clock)) { default: @@ -641,7 +653,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) cputime_lt(p->signal->it_virt_expires, timer->it.cpu.expires.cpu)) break; - goto rebalance; + p->signal->cputime_expires.virt_exp = + timer->it.cpu.expires.cpu; + break; case CPUCLOCK_PROF: if (!cputime_eq(p->signal->it_prof_expires, cputime_zero) && @@ -652,13 +666,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) if (i != RLIM_INFINITY && i <= cputime_to_secs(timer->it.cpu.expires.cpu)) break; - goto rebalance; + p->signal->cputime_expires.prof_exp = + timer->it.cpu.expires.cpu; + break; case CPUCLOCK_SCHED: - rebalance: - process_timer_rebalance( - timer->it.cpu.task, - CPUCLOCK_WHICH(timer->it_clock), - timer->it.cpu.expires, now); + p->signal->cputime_expires.sched_exp = + timer->it.cpu.expires.sched; break; } } @@ -969,13 +982,13 @@ static void check_thread_timers(struct task_struct *tsk, struct signal_struct *const sig = tsk->signal; maxfire = 20; - tsk->it_prof_expires = cputime_zero; + tsk->cputime_expires.prof_exp = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { - tsk->it_prof_expires = t->expires.cpu; + tsk->cputime_expires.prof_exp = t->expires.cpu; break; } t->firing = 1; @@ -984,13 +997,13 @@ static void check_thread_timers(struct task_struct *tsk, ++timers; maxfire = 20; - tsk->it_virt_expires = cputime_zero; + tsk->cputime_expires.virt_exp = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { - tsk->it_virt_expires = t->expires.cpu; + tsk->cputime_expires.virt_exp = t->expires.cpu; break; } t->firing = 1; @@ -999,13 +1012,13 @@ static void check_thread_timers(struct task_struct *tsk, ++timers; maxfire = 20; - tsk->it_sched_expires = 0; + tsk->cputime_expires.sched_exp = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->it_sched_expires = t->expires.sched; + tsk->cputime_expires.sched_exp = t->expires.sched; break; } t->firing = 1; @@ -1055,10 +1068,10 @@ static void check_process_timers(struct task_struct *tsk, { int maxfire; struct signal_struct *const sig = tsk->signal; - cputime_t utime, stime, ptime, virt_expires, prof_expires; + cputime_t utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; - struct task_struct *t; struct list_head *timers = sig->cpu_timers; + struct task_cputime cputime; /* * Don't sample the current process CPU clocks if there are no timers. @@ -1074,18 +1087,10 @@ static void check_process_timers(struct task_struct *tsk, /* * Collect the current process totals. */ - utime = sig->utime; - stime = sig->stime; - sum_sched_runtime = sig->sum_sched_runtime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - sum_sched_runtime += t->se.sum_exec_runtime; - t = next_thread(t); - } while (t != tsk); - ptime = cputime_add(utime, stime); - + thread_group_cputime(tsk, &cputime); + utime = cputime.utime; + ptime = cputime_add(utime, cputime.stime); + sum_sched_runtime = cputime.sum_exec_runtime; maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { @@ -1193,60 +1198,18 @@ static void check_process_timers(struct task_struct *tsk, } } - if (!cputime_eq(prof_expires, cputime_zero) || - !cputime_eq(virt_expires, cputime_zero) || - sched_expires != 0) { - /* - * Rebalance the threads' expiry times for the remaining - * process CPU timers. - */ - - cputime_t prof_left, virt_left, ticks; - unsigned long long sched_left, sched; - const unsigned int nthreads = atomic_read(&sig->live); - - if (!nthreads) - return; - - prof_left = cputime_sub(prof_expires, utime); - prof_left = cputime_sub(prof_left, stime); - prof_left = cputime_div_non_zero(prof_left, nthreads); - virt_left = cputime_sub(virt_expires, utime); - virt_left = cputime_div_non_zero(virt_left, nthreads); - if (sched_expires) { - sched_left = sched_expires - sum_sched_runtime; - do_div(sched_left, nthreads); - sched_left = max_t(unsigned long long, sched_left, 1); - } else { - sched_left = 0; - } - t = tsk; - do { - if (unlikely(t->flags & PF_EXITING)) - continue; - - ticks = cputime_add(cputime_add(t->utime, t->stime), - prof_left); - if (!cputime_eq(prof_expires, cputime_zero) && - (cputime_eq(t->it_prof_expires, cputime_zero) || - cputime_gt(t->it_prof_expires, ticks))) { - t->it_prof_expires = ticks; - } - - ticks = cputime_add(t->utime, virt_left); - if (!cputime_eq(virt_expires, cputime_zero) && - (cputime_eq(t->it_virt_expires, cputime_zero) || - cputime_gt(t->it_virt_expires, ticks))) { - t->it_virt_expires = ticks; - } - - sched = t->se.sum_exec_runtime + sched_left; - if (sched_expires && (t->it_sched_expires == 0 || - t->it_sched_expires > sched)) { - t->it_sched_expires = sched; - } - } while ((t = next_thread(t)) != tsk); - } + if (!cputime_eq(prof_expires, cputime_zero) && + (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || + cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) + sig->cputime_expires.prof_exp = prof_expires; + if (!cputime_eq(virt_expires, cputime_zero) && + (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) || + cputime_gt(sig->cputime_expires.virt_exp, virt_expires))) + sig->cputime_expires.virt_exp = virt_expires; + if (sched_expires != 0 && + (sig->cputime_expires.sched_exp == 0 || + sig->cputime_expires.sched_exp > sched_expires)) + sig->cputime_expires.sched_exp = sched_expires; } /* @@ -1314,6 +1277,78 @@ out: ++timer->it_requeue_pending; } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (cputime_eq(cputime->utime, cputime_zero) && + cputime_eq(cputime->stime, cputime_zero) && + cputime->sum_exec_runtime == 0) + return 1; + return 0; +} + +/** + * task_cputime_expired - Compare two task_cputime entities. + * + * @sample: The task_cputime structure to be checked for expiration. + * @expires: Expiration times, against which @sample will be checked. + * + * Checks @sample against @expires to see if any field of @sample has expired. + * Returns true if any field of the former is greater than the corresponding + * field of the latter if the latter field is set. Otherwise returns false. + */ +static inline int task_cputime_expired(const struct task_cputime *sample, + const struct task_cputime *expires) +{ + if (!cputime_eq(expires->utime, cputime_zero) && + cputime_ge(sample->utime, expires->utime)) + return 1; + if (!cputime_eq(expires->stime, cputime_zero) && + cputime_ge(cputime_add(sample->utime, sample->stime), + expires->stime)) + return 1; + if (expires->sum_exec_runtime != 0 && + sample->sum_exec_runtime >= expires->sum_exec_runtime) + return 1; + return 0; +} + +/** + * fastpath_timer_check - POSIX CPU timers fast path. + * + * @tsk: The task (thread) being checked. + * @sig: The signal pointer for that task. + * + * If there are no timers set return false. Otherwise snapshot the task and + * thread group timers, then compare them with the corresponding expiration + # times. Returns true if a timer has expired, else returns false. + */ +static inline int fastpath_timer_check(struct task_struct *tsk, + struct signal_struct *sig) +{ + struct task_cputime task_sample = { + .utime = tsk->utime, + .stime = tsk->stime, + .sum_exec_runtime = tsk->se.sum_exec_runtime + }; + struct task_cputime group_sample; + + if (task_cputime_zero(&tsk->cputime_expires) && + task_cputime_zero(&sig->cputime_expires)) + return 0; + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + thread_group_cputime(tsk, &group_sample); + return task_cputime_expired(&group_sample, &sig->cputime_expires); +} + /* * This is called from the timer interrupt handler. The irq handler has * already updated our counts. We need to check if any timers fire now. @@ -1323,30 +1358,29 @@ void run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; + struct signal_struct *sig; + struct sighand_struct *sighand; + unsigned long flags; BUG_ON(!irqs_disabled()); -#define UNEXPIRED(clock) \ - (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ - cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) - - if (UNEXPIRED(prof) && UNEXPIRED(virt) && - (tsk->it_sched_expires == 0 || - tsk->se.sum_exec_runtime < tsk->it_sched_expires)) - return; - -#undef UNEXPIRED - + /* Pick up tsk->signal and make sure it's valid. */ + sig = tsk->signal; /* - * Double-check with locks held. + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. Also check that + * tsk->signal is non-NULL; this probably can't happen but cover the + * possibility anyway. */ - read_lock(&tasklist_lock); - if (likely(tsk->signal != NULL)) { - spin_lock(&tsk->sighand->siglock); - + if (unlikely(!sig) || !fastpath_timer_check(tsk, sig)) { + return; + } + sighand = lock_task_sighand(tsk, &flags); + if (likely(sighand)) { /* - * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] - * all the timers that are firing, and put them on the firing list. + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. */ check_thread_timers(tsk, &firing); check_process_timers(tsk, &firing); @@ -1359,9 +1393,8 @@ void run_posix_cpu_timers(struct task_struct *tsk) * that gets the timer lock before we do will give it up and * spin until we've taken care of that timer below. */ - spin_unlock(&tsk->sighand->siglock); } - read_unlock(&tasklist_lock); + unlock_task_sighand(tsk, &flags); /* * Now that all the timers on our list have the firing flag, @@ -1389,10 +1422,9 @@ void run_posix_cpu_timers(struct task_struct *tsk) /* * Set one of the process-wide special case CPU timers. - * The tasklist_lock and tsk->sighand->siglock must be held by the caller. - * The oldval argument is null for the RLIMIT_CPU timer, where *newval is - * absolute; non-null for ITIMER_*, where *newval is relative and we update - * it to be absolute, *oldval is absolute and we update it to be relative. + * The tsk->sighand->siglock must be held by the caller. + * The *newval argument is relative and we update it to be absolute, *oldval + * is absolute and we update it to be relative. */ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) @@ -1435,13 +1467,14 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_ge(list_first_entry(head, struct cpu_timer_list, entry)->expires.cpu, *newval)) { - /* - * Rejigger each thread's expiry time so that one will - * notice before we hit the process-cumulative expiry time. - */ - union cpu_time_count expires = { .sched = 0 }; - expires.cpu = *newval; - process_timer_rebalance(tsk, clock_idx, expires, now); + switch (clock_idx) { + case CPUCLOCK_PROF: + tsk->signal->cputime_expires.prof_exp = *newval; + break; + case CPUCLOCK_VIRT: + tsk->signal->cputime_expires.virt_exp = *newval; + break; + } } } diff --git a/kernel/sched.c b/kernel/sched.c index cc1f81b..c51b5d2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4037,23 +4037,56 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. + * + * Called with task_rq_lock() held on @rq. + */ +static unsigned long long task_delta_exec(struct task_struct *p, struct rq *rq) +{ + if (task_current(rq, p)) { + u64 delta_exec; + + update_rq_clock(rq); + delta_exec = rq->clock - p->se.exec_start; + if ((s64)delta_exec > 0) + return delta_exec; + } + return 0; +} + +/* * Return p->sum_exec_runtime plus any more ns on the sched_clock * that have not yet been banked in case the task is currently running. */ unsigned long long task_sched_runtime(struct task_struct *p) { unsigned long flags; - u64 ns, delta_exec; + u64 ns; struct rq *rq; rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime; - if (task_current(rq, p)) { - update_rq_clock(rq); - delta_exec = rq->clock - p->se.exec_start; - if ((s64)delta_exec > 0) - ns += delta_exec; - } + ns = p->se.sum_exec_runtime + task_delta_exec(p, rq); + task_rq_unlock(rq, &flags); + + return ns; +} + +/* + * Return sum_exec_runtime for the thread group plus any more ns on the + * sched_clock that have not yet been banked in case the task is currently + * running. + */ +unsigned long long thread_group_sched_runtime(struct task_struct *p) +{ + unsigned long flags; + u64 ns; + struct rq *rq; + struct task_cputime totals; + + rq = task_rq_lock(p, &flags); + thread_group_cputime(p, &totals); + ns = totals.sum_exec_runtime + task_delta_exec(p, rq); task_rq_unlock(rq, &flags); return ns; @@ -4070,6 +4103,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) cputime64_t tmp; p->utime = cputime_add(p->utime, cputime); + account_group_user_time(p, cputime); /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -4094,6 +4128,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) tmp = cputime_to_cputime64(cputime); p->utime = cputime_add(p->utime, cputime); + account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); cpustat->user = cputime64_add(cpustat->user, tmp); @@ -4129,6 +4164,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, } p->stime = cputime_add(p->stime, cputime); + account_group_system_time(p, cputime); /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -4170,6 +4206,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal) if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); + account_group_system_time(p, steal); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fb8994c..99aa31a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -507,6 +507,7 @@ static void update_curr(struct cfs_rq *cfs_rq) struct task_struct *curtask = task_of(curr); cpuacct_charge(curtask, delta_exec); + account_group_exec_runtime(curtask, delta_exec); } } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 5523107..8375e69 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -483,6 +483,8 @@ static void update_curr_rt(struct rq *rq) schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); @@ -1412,7 +1414,7 @@ static void watchdog(struct rq *rq, struct task_struct *p) p->rt.timeout++; next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); if (p->rt.timeout > next) - p->it_sched_expires = p->se.sum_exec_runtime; + p->cputime_expires.sched_exp = p->se.sum_exec_runtime; } } diff --git a/kernel/signal.c b/kernel/signal.c index e661b01..6eea582 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1338,6 +1338,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) struct siginfo info; unsigned long flags; struct sighand_struct *psig; + struct task_cputime cputime; int ret = sig; BUG_ON(sig == -1); @@ -1368,10 +1369,9 @@ int do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = tsk->uid; - info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, - tsk->signal->utime)); - info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, - tsk->signal->stime)); + thread_group_cputime(tsk, &cputime); + info.si_utime = cputime_to_jiffies(cputime.utime); + info.si_stime = cputime_to_jiffies(cputime.stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) diff --git a/kernel/sys.c b/kernel/sys.c index 038a7bc..d046a7a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -853,38 +853,28 @@ asmlinkage long sys_setfsgid(gid_t gid) return old_fsgid; } +void do_sys_times(struct tms *tms) +{ + struct task_cputime cputime; + cputime_t cutime, cstime; + + spin_lock_irq(¤t->sighand->siglock); + thread_group_cputime(current, &cputime); + cutime = current->signal->cutime; + cstime = current->signal->cstime; + spin_unlock_irq(¤t->sighand->siglock); + tms->tms_utime = cputime_to_clock_t(cputime.utime); + tms->tms_stime = cputime_to_clock_t(cputime.stime); + tms->tms_cutime = cputime_to_clock_t(cutime); + tms->tms_cstime = cputime_to_clock_t(cstime); +} + asmlinkage long sys_times(struct tms __user * tbuf) { - /* - * In the SMP world we might just be unlucky and have one of - * the times increment as we use it. Since the value is an - * atomically safe type this is just fine. Conceptually its - * as if the syscall took an instant longer to occur. - */ if (tbuf) { struct tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; - cputime_t utime, stime, cutime, cstime; - - spin_lock_irq(&tsk->sighand->siglock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - - tmp.tms_utime = cputime_to_clock_t(utime); - tmp.tms_stime = cputime_to_clock_t(stime); - tmp.tms_cutime = cputime_to_clock_t(cutime); - tmp.tms_cstime = cputime_to_clock_t(cstime); + + do_sys_times(&tmp); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } @@ -1445,7 +1435,6 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) { struct rlimit new_rlim, *old_rlim; - unsigned long it_prof_secs; int retval; if (resource >= RLIM_NLIMITS) @@ -1491,18 +1480,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) if (new_rlim.rlim_cur == RLIM_INFINITY) goto out; - it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); - if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) { - unsigned long rlim_cur = new_rlim.rlim_cur; - cputime_t cputime; - - cputime = secs_to_cputime(rlim_cur); - read_lock(&tasklist_lock); - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); - read_unlock(&tasklist_lock); - } + update_rlimit_cpu(new_rlim.rlim_cur); out: return 0; } @@ -1540,11 +1518,8 @@ out: * */ -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, - cputime_t *utimep, cputime_t *stimep) +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) { - *utimep = cputime_add(*utimep, t->utime); - *stimep = cputime_add(*stimep, t->stime); r->ru_nvcsw += t->nvcsw; r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; @@ -1558,12 +1533,13 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; + struct task_cputime cputime; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { - accumulate_thread_rusage(p, r, &utime, &stime); + accumulate_thread_rusage(p, r); goto out; } @@ -1586,8 +1562,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - utime = cputime_add(utime, p->signal->utime); - stime = cputime_add(stime, p->signal->stime); + thread_group_cputime(p, &cputime); + utime = cputime_add(utime, cputime.utime); + stime = cputime_add(stime, cputime.stime); r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; @@ -1596,7 +1573,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_oublock += p->signal->oublock; t = p; do { - accumulate_thread_rusage(t, r, &utime, &stime); + accumulate_thread_rusage(t, r); t = next_thread(t); } while (t != p); break; -- cgit v1.1 From 430b5294bd72c085c730e1e4b86580f164d976bf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Sep 2008 16:33:01 +0200 Subject: timers: fix itimer/many thread hang, fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: kernel/fork.c:843: error: ‘struct signal_struct’ has no member named ‘sum_sched_runtime’ kernel/irq/handle.c:117: warning: ‘sparse_irq_lock’ defined but not used Signed-off-by: Ingo Molnar --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a8ac2ef..1181b9a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -834,7 +834,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); - sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); -- cgit v1.1 From 5ce73a4a5a4893a1aa4cdeed1b1a5a6de42c43b6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Sep 2008 17:11:46 +0200 Subject: timers: fix itimer/many thread hang, cleanups Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index dba1c33..9a7ea04 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -94,7 +94,7 @@ void update_rlimit_cpu(unsigned long rlim_new) cputime = secs_to_cputime(rlim_new); if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || - cputime_lt(current->signal->it_prof_expires, cputime)) { + cputime_lt(current->signal->it_prof_expires, cputime)) { spin_lock_irq(¤t->sighand->siglock); set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); spin_unlock_irq(¤t->sighand->siglock); @@ -1372,9 +1372,9 @@ void run_posix_cpu_timers(struct task_struct *tsk) * tsk->signal is non-NULL; this probably can't happen but cover the * possibility anyway. */ - if (unlikely(!sig) || !fastpath_timer_check(tsk, sig)) { + if (unlikely(!sig) || !fastpath_timer_check(tsk, sig)) return; - } + sighand = lock_task_sighand(tsk, &flags); if (likely(sighand)) { /* -- cgit v1.1 From 2344abbcbdb82140050e8be29d3d55e4f6fe860b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Sep 2008 11:32:50 -0700 Subject: clockevents: make device shutdown robust The device shut down does not cleanup the next_event variable of the clock event device. So when the device is reactivated the possible stale next_event value can prevent the device to be reprogrammed as it claims to wait on a event already. This is the root cause of the resurfacing suspend/resume problem, where systems need key press to come back to life. Fix this by setting next_event to KTIME_MAX when the device is shut down. Use a separate function for shutdown which takes care of that and only keep the direct set mode call in the broadcast code, where we can not touch the next_event value. Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 12 +++++++++++- kernel/time/tick-broadcast.c | 9 ++++----- kernel/time/tick-common.c | 4 ++-- kernel/time/tick-internal.h | 2 ++ 4 files changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1876b52..f8d9680 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev, } /** + * clockevents_shutdown - shutdown the device and clear next_event + * @dev: device to shutdown + */ +void clockevents_shutdown(struct clock_event_device *dev) +{ + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + dev->next_event.tv64 = KTIME_MAX; +} + +/** * clockevents_program_event - Reprogram the clock event device. * @expires: absolute expiry time (monotonic clock) * @@ -206,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old, if (new) { BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); - clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(new); } local_irq_restore(flags); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2f5a382..f1f3eee 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -236,8 +236,7 @@ static void tick_do_broadcast_on_off(void *why) if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); if (td->mode == TICKDEV_MODE_PERIODIC) - clockevents_set_mode(dev, - CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) tick_broadcast_force = 1; @@ -254,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why) if (cpus_empty(tick_broadcast_mask)) { if (!bc_stopped) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); } else if (bc_stopped) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); @@ -306,7 +305,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpus_empty(tick_broadcast_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); } spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -321,7 +320,7 @@ void tick_suspend_broadcast(void) bc = tick_broadcast_device.evtdev; if (bc) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index c477719..019315e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -249,7 +249,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * not give it back to the clockevents layer ! */ if (tick_is_broadcast_device(curdev)) { - clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(curdev); curdev = NULL; } clockevents_exchange_device(curdev, newdev); @@ -311,7 +311,7 @@ static void tick_suspend(void) unsigned long flags; spin_lock_irqsave(&tick_device_lock, flags); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(td->evtdev); spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 0ffc291..6e9db97 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -10,6 +10,8 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void clockevents_shutdown(struct clock_event_device *dev); + /* * NO_HZ / high resolution timer shared code */ -- cgit v1.1 From d7cfb60c5cf904ecf1e0ae23ec178175b86f0d4a Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 19 Sep 2008 13:13:44 +0100 Subject: hrtimer: remove hrtimer_clock_base::get_softirq_time() Peter Zijlstra noticed this 8 months ago and I just noticed it again. hrtimer_clock_base::get_softirq_time() is currently unused in the entire tree. In fact, looking at the logs, it appears as if it was never used. Remove it. Signed-off-by: Mark McLoughlin Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 03ea137..4d761d5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1401,9 +1401,7 @@ void hrtimer_run_queues(void) if (!base->first) continue; - if (base->get_softirq_time) - base->softirq_time = base->get_softirq_time(); - else if (gettime) { + if (gettime) { hrtimer_get_softirq_time(cpu_base); gettime = 0; } -- cgit v1.1 From 15afe09bf496ae10c989e1a375a6b5da7bd3e16e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 20 Sep 2008 23:38:02 +0200 Subject: sched: wakeup preempt when small overlap Lin Ming reported a 10% OLTP regression against 2.6.27-rc4. The difference seems to come from different preemption agressiveness, which affects the cache footprint of the workload and its effective cache trashing. Aggresively preempt a task if its avg overlap is very small, this should avoid the task going to sleep and find it still running when we schedule back to it - saving a wakeup. Reported-by: Lin Ming Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++------ kernel/sched_fair.c | 13 ++++++++++--- kernel/sched_features.h | 1 + kernel/sched_idletask.c | 6 +++--- kernel/sched_rt.c | 2 +- 5 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0d8905a..ad9d39b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -604,9 +604,9 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) { - rq->curr->sched_class->check_preempt_curr(rq, p); + rq->curr->sched_class->check_preempt_curr(rq, p, sync); } static inline int cpu_of(struct rq *rq) @@ -2282,7 +2282,7 @@ out_running: trace_mark(kernel_sched_wakeup, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2417,7 +2417,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_mark(kernel_sched_wakeup_new, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2877,7 +2877,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - check_preempt_curr(this_rq, p); + check_preempt_curr(this_rq, p, 0); } /* @@ -6007,7 +6007,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p); + check_preempt_curr(rq_dest, p, 0); } done: ret = 1; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a10ac0b..7328383 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1331,7 +1331,7 @@ static inline int depth_se(struct sched_entity *se) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); @@ -1367,6 +1367,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (!sched_feat(WAKEUP_PREEMPT)) return; + if (sched_feat(WAKEUP_OVERLAP) && sync && + se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost) { + resched_task(curr); + return; + } + /* * preemption test can be made between sibling entities who are in the * same cfs_rq i.e who have a common parent. Walk up the hierarchy of @@ -1649,7 +1656,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* @@ -1666,7 +1673,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 9353ca7..bf027a7 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) +SCHED_FEAT(WAKEUP_OVERLAP, 1) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3a4f92d..dec4cca 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) { resched_task(rq->idle); } @@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } static void prio_changed_idle(struct rq *rq, struct task_struct *p, @@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 5523107..6d2d0a5d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -783,7 +783,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); -- cgit v1.1 From f681bbd656b01439be904250a1581ca9c27505a1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 22 Sep 2008 16:29:00 +0200 Subject: sched: turn off WAKEUP_OVERLAP WAKEUP_OVERLAP is not a winner on a 16way box, running psql+sysbench: .27-rc7-NO_WAKEUP_OVERLAP .27-rc7-WAKEUP_OVERLAP ------------------------------------------------- 1: 694 811 +14.39% 2: 1454 1427 -1.86% 4: 3017 3070 +1.70% 8: 5694 5808 +1.96% 16: 10592 10612 +0.19% 32: 9693 9647 -0.48% 64: 8507 8262 -2.97% 128: 8402 7087 -18.55% 256: 8419 5124 -64.30% 512: 7990 3671 -117.62% ------------------------------------------------- SUM: 64466 55524 -16.11% ... so turn it off by default. Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_features.h b/kernel/sched_features.h index bf027a7..7c9e8f4 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -11,4 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) -SCHED_FEAT(WAKEUP_OVERLAP, 1) +SCHED_FEAT(WAKEUP_OVERLAP, 0) -- cgit v1.1 From caea8a03702c147e8ae90da0801e7ba8297b1d46 Mon Sep 17 00:00:00 2001 From: Chris Friesen Date: Mon, 22 Sep 2008 11:06:09 -0600 Subject: sched: fix list traversal to use _rcu variant load_balance_fair() calls rcu_read_lock() but then traverses the list using the regular list traversal routine. This patch converts the list traversal to use the _rcu version. Signed-off-by: Chris Friesen Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7328383..3b89aa6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1521,7 +1521,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rcu_read_lock(); update_h_load(busiest_cpu); - list_for_each_entry(tg, &task_groups, list) { + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; -- cgit v1.1 From fa748203175de7c08f2df80e5a0eeca40329b5e2 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Mon, 22 Sep 2008 14:55:45 -0700 Subject: sched: fix init_hrtick() section mismatch warning LD kernel/built-in.o WARNING: kernel/built-in.o(.text+0x326): Section mismatch in reference from the function init_hrtick() to the variable .cpuinit.data:hotplug_hrtick_nb.8 The function init_hrtick() references the variable __cpuinitdata hotplug_hrtick_nb.8. This is often because init_hrtick lacks a __cpuinitdata annotation or the annotation of hotplug_hrtick_nb.8 is wrong. Signed-off-by: Md.Rakib H. Mullick Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9889080..13dd2db 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1087,7 +1087,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_DONE; } -static void init_hrtick(void) +static __init void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } -- cgit v1.1 From 006c75f146e58e080d2b2725a6664f71886e112b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 22 Sep 2008 14:55:46 -0700 Subject: sched: clarify ifdef tangle - Add some comments to try to make the ifdef puzzle a bit clearer - Explicitly inline one of the three init_hrtick() implementations. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ad9d39b..927c930 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1102,7 +1102,7 @@ static void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); } -static void init_hrtick(void) +static inline void init_hrtick(void) { } #endif /* CONFIG_SMP */ @@ -1121,7 +1121,7 @@ static void init_rq_hrtick(struct rq *rq) rq->hrtick_timer.function = hrtick; rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } -#else +#else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) { } @@ -1133,7 +1133,7 @@ static inline void init_rq_hrtick(struct rq *rq) static inline void init_hrtick(void) { } -#endif +#endif /* CONFIG_SCHED_HRTICK */ /* * resched_task - mark a task 'to be rescheduled now'. -- cgit v1.1 From 3a72dc8eb5a7122fff439a22bd22486a4fff505c Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 22 Sep 2008 14:55:46 -0700 Subject: rcu: fix sparse shadowed variable warning kernel/rcuclassic.c:564:18: warning: symbol 'flags' shadows an earlier one kernel/rcuclassic.c:527:16: originally declared here Signed-off-by: Harvey Harrison Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 743cf05..ed15128 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -561,15 +561,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, local_irq_restore(flags); if (rcu_batch_after(rdp->batch, rcp->pending)) { - unsigned long flags; + unsigned long flags2; /* and start it/schedule start if it's a new batch */ - spin_lock_irqsave(&rcp->lock, flags); + spin_lock_irqsave(&rcp->lock, flags2); if (rcu_batch_after(rdp->batch, rcp->pending)) { rcp->pending = rdp->batch; rcu_start_batch(rcp); } - spin_unlock_irqrestore(&rcp->lock, flags); + spin_unlock_irqrestore(&rcp->lock, flags2); } } -- cgit v1.1 From 6441402b1f173fa38e561d3cee7c01c32e5281ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 18:46:37 +0200 Subject: clockevents: prevent cpu online to interfere with nohz Impact: rare hang which can be triggered on CPU online. tick_do_timer_cpu keeps track of the CPU which updates jiffies via do_timer. The value -1 is used to signal, that currently no CPU is doing this. There are two cases, where the variable can have this state: boot: necessary for systems where the boot cpu id can be != 0 nohz long idle sleep: When the CPU which did the jiffies update last goes into a long idle sleep it drops the update jiffies duty so another CPU which is not idle can pick it up and keep jiffies going. Using the same value for both situations is wrong, as the CPU online code can see the -1 state when the timer of the newly onlined CPU is setup. The setup for a newly onlined CPU goes through periodic mode and can pick up the do_timer duty without being aware of the nohz / highres mode of the already running system. Use two separate states and make them constants to avoid magic numbers confusion. Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 7 ++++--- kernel/time/tick-internal.h | 4 ++++ kernel/time/tick-sched.c | 8 ++++---- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 019315e..b523d09 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; -int tick_do_timer_cpu __read_mostly = -1; +int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; DEFINE_SPINLOCK(tick_device_lock); /* @@ -148,7 +148,7 @@ static void tick_setup_device(struct tick_device *td, * If no cpu took the do_timer update, assign it to * this cpu: */ - if (tick_do_timer_cpu == -1) { + if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { tick_do_timer_cpu = cpu; tick_next_period = ktime_get(); tick_period = ktime_set(0, NSEC_PER_SEC / HZ); @@ -300,7 +300,8 @@ static void tick_shutdown(unsigned int *cpup) if (*cpup == tick_do_timer_cpu) { int cpu = first_cpu(cpu_online_map); - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; + tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : + TICK_DO_TIMER_NONE; } spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 6e9db97..e18014f 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,10 @@ /* * tick internal variable and functions used by low/high res code */ + +#define TICK_DO_TIMER_NONE -1 +#define TICK_DO_TIMER_BOOT -2 + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a87b046..31a14e8 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -221,7 +221,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) @@ -303,7 +303,7 @@ void tick_nohz_stop_sched_tick(int inidle) * invoked. */ if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->idle_sleeps++; @@ -468,7 +468,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; /* Check, if the jiffies need an update */ @@ -570,7 +570,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; #endif -- cgit v1.1 From 49d670fb8dd62d3ed4e3ed2513538ea65b051aed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 18:56:01 +0200 Subject: clockevents: prevent stale tick_next_period for onlining CPUs Impact: possible hang on CPU onlining in timer one shot mode. The tick_next_period variable is only used during boot on nohz/highres enabled systems, but for CPU onlining it needs to be maintained when the per cpu clock events device operates in one shot mode. Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 31a14e8..39019b3f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -75,6 +75,9 @@ static void tick_do_update_jiffies64(ktime_t now) incr * ticks); } do_timer(++ticks); + + /* Keep the tick_next_period variable up to date */ + tick_next_period = ktime_add(last_jiffies_update, tick_period); } write_sequnlock(&xtime_lock); } -- cgit v1.1 From 302745699c1b675b5d2a1af87271de10e4d96b6a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 19:02:25 +0200 Subject: clockevents: check broadcast device not tick device Impact: Possible hang on CPU online observed on AMD C1E machines. The broadcast setup code looks at the mode of the tick device to determine whether it needs to be shut down or setup. This is wrong when the broadcast mode is set to one shot already. This can happen when a CPU is brought online as it goes through the periodic setup first. The problem went unnoticed as sane systems do not call into that code before the switch to one shot for the clock event device happens. The AMD C1E idle routine switches over immediately and thereby shuts down the just setup device before the first interrupt happens. Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f1f3eee..e2b66d1 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -235,7 +235,7 @@ static void tick_do_broadcast_on_off(void *why) case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) + if (bc->mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) @@ -245,7 +245,7 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_broadcast_force && cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) + if (bc->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); } break; -- cgit v1.1 From 27ce4cb4a0c7cf59b9a9952266883862f2e4c99f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 19:04:02 +0200 Subject: clockevents: prevent mode mismatch on cpu online Impact: timer hang on CPU online observed on AMD C1E systems When a CPU is brought online then the broadcast machinery can be in the one shot state already. Check this and setup the timer device of the new CPU in one shot mode so the broadcast code can pick up the next_event value correctly. Another AMD C1E oddity, as we switch to broadcast immediately and not after the full bring up via the ACPI cpu idle code. Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 8 ++++++++ kernel/time/tick-common.c | 3 ++- kernel/time/tick-internal.h | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e2b66d1..bd70345 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -575,4 +575,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +/* + * Check, whether the broadcast device is in one shot mode + */ +int tick_broadcast_oneshot_active(void) +{ + return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; +} + #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b523d09..df12434 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if (!tick_device_is_functional(dev)) return; - if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !tick_broadcast_oneshot_active()) { clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); } else { unsigned long seq; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e18014f..55c3f4b 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -35,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); +extern int tick_broadcast_oneshot_active(void); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -43,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +static inline int tick_broadcast_oneshot_active(void) { return 0; } # endif /* !BROADCAST */ #else /* !ONESHOT */ -- cgit v1.1 From f8e256c687eb53850685747757c8d75e58756e15 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Sep 2008 13:00:57 +0200 Subject: timers: fix build error in !oneshot case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel/time/tick-common.c: In function ‘tick_setup_periodic’: kernel/time/tick-common.c:113: error: implicit declaration of function ‘tick_broadcast_oneshot_active’ Signed-off-by: Ingo Molnar --- kernel/time/tick-internal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 55c3f4b..4692487 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -74,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { return 0; } +static inline int tick_broadcast_oneshot_active(void) { return 0; } #endif /* !TICK_ONESHOT */ /* -- cgit v1.1 From bb34d92f643086d546b49cef680f6f305ed84414 Mon Sep 17 00:00:00 2001 From: Frank Mayhar Date: Fri, 12 Sep 2008 09:54:39 -0700 Subject: timers: fix itimer/many thread hang, v2 This is the second resubmission of the posix timer rework patch, posted a few days ago. This includes the changes from the previous resubmittion, which addressed Oleg Nesterov's comments, removing the RCU stuff from the patch and un-inlining the thread_group_cputime() function for SMP. In addition, per Ingo Molnar it simplifies the UP code, consolidating much of it with the SMP version and depending on lower-level SMP/UP handling to take care of the differences. It also cleans up some UP compile errors, moves the scheduler stats-related macros into kernel/sched_stats.h, cleans up a merge error in kernel/fork.c and has a few other minor fixes and cleanups as suggested by Oleg and Ingo. Thanks for the review, guys. Signed-off-by: Frank Mayhar Cc: Roland McGrath Cc: Alexey Dobriyan Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/fork.c | 5 +- kernel/posix-cpu-timers.c | 153 +++++++++++++++++++--------------------------- kernel/sched.c | 47 +++----------- kernel/sched_stats.h | 136 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 208 insertions(+), 133 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1181b9a..021ae01 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -791,7 +791,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) int ret; if (clone_flags & CLONE_THREAD) { - ret = thread_group_cputime_clone_thread(current, tsk); + ret = thread_group_cputime_clone_thread(current); if (likely(!ret)) { atomic_inc(¤t->signal->count); atomic_inc(¤t->signal->live); @@ -834,9 +834,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); - INIT_LIST_HEAD(&sig->cpu_timers[0]); - INIT_LIST_HEAD(&sig->cpu_timers[1]); - INIT_LIST_HEAD(&sig->cpu_timers[2]); taskstats_tgid_init(sig); task_lock(current->group_leader); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9a7ea04..153dcb2 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -7,50 +7,46 @@ #include #include #include +#include -#ifdef CONFIG_SMP /* - * Allocate the thread_group_cputime structure appropriately for SMP kernels - * and fill in the current values of the fields. Called from copy_signal() - * via thread_group_cputime_clone_thread() when adding a second or subsequent + * Allocate the thread_group_cputime structure appropriately and fill in the + * current values of the fields. Called from copy_signal() via + * thread_group_cputime_clone_thread() when adding a second or subsequent * thread to a thread group. Assumes interrupts are enabled when called. */ -int thread_group_cputime_alloc_smp(struct task_struct *tsk) +int thread_group_cputime_alloc(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct task_cputime *cputime; /* * If we have multiple threads and we don't already have a - * per-CPU task_cputime struct, allocate one and fill it in with - * the times accumulated so far. + * per-CPU task_cputime struct (checked in the caller), allocate + * one and fill it in with the times accumulated so far. We may + * race with another thread so recheck after we pick up the sighand + * lock. */ - if (sig->cputime.totals) - return 0; cputime = alloc_percpu(struct task_cputime); if (cputime == NULL) return -ENOMEM; - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); if (sig->cputime.totals) { spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); free_percpu(cputime); return 0; } sig->cputime.totals = cputime; - cputime = per_cpu_ptr(sig->cputime.totals, get_cpu()); + cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id()); cputime->utime = tsk->utime; cputime->stime = tsk->stime; cputime->sum_exec_runtime = tsk->se.sum_exec_runtime; - put_cpu_no_resched(); spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); return 0; } /** - * thread_group_cputime_smp - Sum the thread group time fields across all CPUs. + * thread_group_cputime - Sum the thread group time fields across all CPUs. * * @tsk: The task we use to identify the thread group. * @times: task_cputime structure in which we return the summed fields. @@ -58,7 +54,7 @@ int thread_group_cputime_alloc_smp(struct task_struct *tsk) * Walk the list of CPUs to sum the per-CPU time fields in the thread group * time structure. */ -void thread_group_cputime_smp( +void thread_group_cputime( struct task_struct *tsk, struct task_cputime *times) { @@ -83,8 +79,6 @@ void thread_group_cputime_smp( } } -#endif /* CONFIG_SMP */ - /* * Called after updating RLIMIT_CPU to set timer expiration if necessary. */ @@ -300,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, cpu->cpu = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = task_sched_runtime(p); + cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p); break; } return 0; @@ -309,16 +303,15 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. - * Must be called with tasklist_lock held for reading, and p->sighand->siglock. */ -static int cpu_clock_sample_group_locked(unsigned int clock_idx, - struct task_struct *p, - union cpu_time_count *cpu) +static int cpu_clock_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) { struct task_cputime cputime; thread_group_cputime(p, &cputime); - switch (clock_idx) { + switch (which_clock) { default: return -EINVAL; case CPUCLOCK_PROF: @@ -328,29 +321,12 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, cpu->cpu = cputime.utime; break; case CPUCLOCK_SCHED: - cpu->sched = thread_group_sched_runtime(p); + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); break; } return 0; } -/* - * Sample a process (thread group) clock for the given group_leader task. - * Must be called with tasklist_lock held for reading. - */ -static int cpu_clock_sample_group(const clockid_t which_clock, - struct task_struct *p, - union cpu_time_count *cpu) -{ - int ret; - unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p, - cpu); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - return ret; -} - int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { @@ -1324,29 +1300,37 @@ static inline int task_cputime_expired(const struct task_cputime *sample, * fastpath_timer_check - POSIX CPU timers fast path. * * @tsk: The task (thread) being checked. - * @sig: The signal pointer for that task. * - * If there are no timers set return false. Otherwise snapshot the task and - * thread group timers, then compare them with the corresponding expiration - # times. Returns true if a timer has expired, else returns false. + * Check the task and thread group timers. If both are zero (there are no + * timers set) return false. Otherwise snapshot the task and thread group + * timers and compare them with the corresponding expiration times. Return + * true if a timer has expired, else return false. */ -static inline int fastpath_timer_check(struct task_struct *tsk, - struct signal_struct *sig) +static inline int fastpath_timer_check(struct task_struct *tsk) { - struct task_cputime task_sample = { - .utime = tsk->utime, - .stime = tsk->stime, - .sum_exec_runtime = tsk->se.sum_exec_runtime - }; - struct task_cputime group_sample; + struct signal_struct *sig = tsk->signal; - if (task_cputime_zero(&tsk->cputime_expires) && - task_cputime_zero(&sig->cputime_expires)) + if (unlikely(!sig)) return 0; - if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) - return 1; - thread_group_cputime(tsk, &group_sample); - return task_cputime_expired(&group_sample, &sig->cputime_expires); + + if (!task_cputime_zero(&tsk->cputime_expires)) { + struct task_cputime task_sample = { + .utime = tsk->utime, + .stime = tsk->stime, + .sum_exec_runtime = tsk->se.sum_exec_runtime + }; + + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + } + if (!task_cputime_zero(&sig->cputime_expires)) { + struct task_cputime group_sample; + + thread_group_cputime(tsk, &group_sample); + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) + return 1; + } + return 0; } /* @@ -1358,43 +1342,34 @@ void run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; - struct signal_struct *sig; - struct sighand_struct *sighand; - unsigned long flags; BUG_ON(!irqs_disabled()); - /* Pick up tsk->signal and make sure it's valid. */ - sig = tsk->signal; /* * The fast path checks that there are no expired thread or thread - * group timers. If that's so, just return. Also check that - * tsk->signal is non-NULL; this probably can't happen but cover the - * possibility anyway. + * group timers. If that's so, just return. */ - if (unlikely(!sig) || !fastpath_timer_check(tsk, sig)) + if (!fastpath_timer_check(tsk)) return; - sighand = lock_task_sighand(tsk, &flags); - if (likely(sighand)) { - /* - * Here we take off tsk->signal->cpu_timers[N] and - * tsk->cpu_timers[N] all the timers that are firing, and - * put them on the firing list. - */ - check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); + spin_lock(&tsk->sighand->siglock); + /* + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. + */ + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - } - unlock_task_sighand(tsk, &flags); + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + spin_unlock(&tsk->sighand->siglock); /* * Now that all the timers on our list have the firing flag, @@ -1433,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - cpu_clock_sample_group_locked(clock_idx, tsk, &now); + cpu_clock_sample_group(clock_idx, tsk, &now); if (oldval) { if (!cputime_eq(*oldval, cputime_zero)) { diff --git a/kernel/sched.c b/kernel/sched.c index c51b5d2..260c22c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4039,55 +4039,22 @@ EXPORT_PER_CPU_SYMBOL(kstat); /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. - * - * Called with task_rq_lock() held on @rq. */ -static unsigned long long task_delta_exec(struct task_struct *p, struct rq *rq) +unsigned long long task_delta_exec(struct task_struct *p) { + struct rq *rq; + unsigned long flags; + u64 ns = 0; + + rq = task_rq_lock(p, &flags); if (task_current(rq, p)) { u64 delta_exec; update_rq_clock(rq); delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) - return delta_exec; + ns = delta_exec; } - return 0; -} - -/* - * Return p->sum_exec_runtime plus any more ns on the sched_clock - * that have not yet been banked in case the task is currently running. - */ -unsigned long long task_sched_runtime(struct task_struct *p) -{ - unsigned long flags; - u64 ns; - struct rq *rq; - - rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime + task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); - - return ns; -} - -/* - * Return sum_exec_runtime for the thread group plus any more ns on the - * sched_clock that have not yet been banked in case the task is currently - * running. - */ -unsigned long long thread_group_sched_runtime(struct task_struct *p) -{ - unsigned long flags; - u64 ns; - struct rq *rq; - struct task_cputime totals; - - rq = task_rq_lock(p, &flags); - thread_group_cputime(p, &totals); - ns = totals.sum_exec_runtime + task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); return ns; } diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8385d43..d6903bd 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -270,3 +270,139 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +/* + * The following are functions that support scheduler-internal time accounting. + * These functions are generally called at the timer tick. None of this depends + * on CONFIG_SCHEDSTATS. + */ + +#ifdef CONFIG_SMP + +/** + * thread_group_cputime_account_user - Maintain utime for a thread group. + * + * @tgtimes: Pointer to thread_group_cputime structure. + * @cputime: Time value by which to increment the utime field of that + * structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the utime field there. + */ +static inline void thread_group_cputime_account_user( + struct thread_group_cputime *tgtimes, + cputime_t cputime) +{ + if (tgtimes->totals) { + struct task_cputime *times; + + times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times->utime = cputime_add(times->utime, cputime); + put_cpu_no_resched(); + } +} + +/** + * thread_group_cputime_account_system - Maintain stime for a thread group. + * + * @tgtimes: Pointer to thread_group_cputime structure. + * @cputime: Time value by which to increment the stime field of that + * structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the stime field there. + */ +static inline void thread_group_cputime_account_system( + struct thread_group_cputime *tgtimes, + cputime_t cputime) +{ + if (tgtimes->totals) { + struct task_cputime *times; + + times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times->stime = cputime_add(times->stime, cputime); + put_cpu_no_resched(); + } +} + +/** + * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a + * thread group. + * + * @tgtimes: Pointer to thread_group_cputime structure. + * @ns: Time value by which to increment the sum_exec_runtime field + * of that structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the sum_exec_runtime field there. + */ +static inline void thread_group_cputime_account_exec_runtime( + struct thread_group_cputime *tgtimes, + unsigned long long ns) +{ + if (tgtimes->totals) { + struct task_cputime *times; + + times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times->sum_exec_runtime += ns; + put_cpu_no_resched(); + } +} + +#else /* CONFIG_SMP */ + +static inline void thread_group_cputime_account_user( + struct thread_group_cputime *tgtimes, + cputime_t cputime) +{ + tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime); +} + +static inline void thread_group_cputime_account_system( + struct thread_group_cputime *tgtimes, + cputime_t cputime) +{ + tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime); +} + +static inline void thread_group_cputime_account_exec_runtime( + struct thread_group_cputime *tgtimes, + unsigned long long ns) +{ + tgtimes->totals->sum_exec_runtime += ns; +} + +#endif /* CONFIG_SMP */ + +/* + * These are the generic time-accounting routines that use the above + * functions. They are the functions actually called by the scheduler. + */ +static inline void account_group_user_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (likely(sig)) + thread_group_cputime_account_user(&sig->cputime, cputime); +} + +static inline void account_group_system_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (likely(sig)) + thread_group_cputime_account_system(&sig->cputime, cputime); +} + +static inline void account_group_exec_runtime(struct task_struct *tsk, + unsigned long long ns) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (likely(sig)) + thread_group_cputime_account_exec_runtime(&sig->cputime, ns); +} -- cgit v1.1 From 695698500912c4479ddf4723e492de3970ff8530 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 14:54:23 +0200 Subject: sched: rework wakeup preemption Rework the wakeup preemption to work on real runtime instead of the virtual runtime. This greatly simplifies the code. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 133 ++-------------------------------------------------- 1 file changed, 4 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3b89aa6..c208997 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) } /* - * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in - * that it favours >=0 over <0. - * - * -20 | - * | - * 0 --------+------- - * .' - * 19 .' - * - */ -static unsigned long -calc_delta_asym(unsigned long delta, struct sched_entity *se) -{ - struct load_weight lw = { - .weight = NICE_0_LOAD, - .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) - }; - - for_each_sched_entity(se) { - struct load_weight *se_lw = &se->load; - unsigned long rw = cfs_rq_of(se)->load.weight; - -#ifdef CONFIG_FAIR_SCHED_GROUP - struct cfs_rq *cfs_rq = se->my_q; - struct task_group *tg = NULL - - if (cfs_rq) - tg = cfs_rq->tg; - - if (tg && tg->shares < NICE_0_LOAD) { - /* - * scale shares to what it would have been had - * tg->weight been NICE_0_LOAD: - * - * weight = 1024 * shares / tg->weight - */ - lw.weight *= se->load.weight; - lw.weight /= tg->shares; - - lw.inv_weight = 0; - - se_lw = &lw; - rw += lw.weight - se->load.weight; - } else -#endif - - if (se->load.weight < NICE_0_LOAD) { - se_lw = &lw; - rw += NICE_0_LOAD - se->load.weight; - } - - delta = calc_delta_mine(delta, rw, se_lw); - } - - return delta; -} - -/* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. */ @@ -1281,54 +1223,12 @@ static unsigned long wakeup_gran(struct sched_entity *se) * + nice tasks. */ if (sched_feat(ASYM_GRAN)) - gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); - else - gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); + gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); return gran; } /* - * Should 'se' preempt 'curr'. - * - * |s1 - * |s2 - * |s3 - * g - * |<--->|c - * - * w(c, s1) = -1 - * w(c, s2) = 0 - * w(c, s3) = 1 - * - */ -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -{ - s64 gran, vdiff = curr->vruntime - se->vruntime; - - if (vdiff < 0) - return -1; - - gran = wakeup_gran(curr); - if (vdiff > gran) - return 1; - - return 0; -} - -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - -/* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) @@ -1336,7 +1236,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - int se_depth, pse_depth; + s64 delta_exec; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1374,33 +1274,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) return; } - /* - * preemption test can be made between sibling entities who are in the - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of - * both tasks until we find their ancestors who are siblings of common - * parent. - */ - - /* First walk up until both entities are at same depth */ - se_depth = depth_se(se); - pse_depth = depth_se(pse); - - while (se_depth > pse_depth) { - se_depth--; - se = parent_entity(se); - } - - while (pse_depth > se_depth) { - pse_depth--; - pse = parent_entity(pse); - } - - while (!is_same_group(se, pse)) { - se = parent_entity(se); - pse = parent_entity(pse); - } - - if (wakeup_preempt_entity(se, pse) == 1) + delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; + if (delta_exec > wakeup_gran(pse)) resched_task(curr); } -- cgit v1.1 From 940959e93949e839c14f8ddc3b9b0e34a2ab6e29 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:42 +0200 Subject: sched: fixlet for group load balance We should not only correct the increment for the initial group, but should be consistent and do so for all the groups we encounter. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c208997..0c59da7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1027,7 +1027,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - long more_w; if (!tg->parent) return wl; @@ -1039,18 +1038,17 @@ static long effective_load(struct task_group *tg, int cpu, if (!wl && sched_feat(ASYM_EFF_LOAD)) return wl; - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; - for_each_sched_entity(se) { -#define D(n) (likely(n) ? (n) : 1) - long S, rw, s, a, b; + long more_w; + + /* + * Instead of using this increment, also add the difference + * between when the shares were last updated and now. + */ + more_w = se->my_q->load.weight - se->my_q->rq_weight; + wl += more_w; + wg += more_w; S = se->my_q->tg->shares; s = se->my_q->shares; @@ -1059,7 +1057,11 @@ static long effective_load(struct task_group *tg, int cpu, a = S*(rw + wl); b = S*rw + s*wg; - wl = s*(a-b)/D(b); + wl = s*(a-b); + + if (likely(b)) + wl /= b; + /* * Assume the group is already running and will * thus already be accounted for in the weight. @@ -1068,7 +1070,6 @@ static long effective_load(struct task_group *tg, int cpu, * alter the group weight. */ wg = 0; -#undef D } return wl; -- cgit v1.1 From 78333cdd0e472180743d35988e576d6ecc6f6ddb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:43 +0200 Subject: sched: add some comments to the bandwidth code Hopefully clarify some of this code a little. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2e228bd..d570a8c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP +/* + * We ran out of runtime, see if we can borrow some from our neighbours. + */ static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq) continue; spin_lock(&iter->rt_runtime_lock); + /* + * Either all rqs have inf runtime and there's nothing to steal + * or __disable_runtime() below sets a specific rq to inf to + * indicate its been disabled and disalow stealing. + */ if (iter->rt_runtime == RUNTIME_INF) goto next; + /* + * From runqueues with spare time, take 1/n part of their + * spare time, but no more than our period. + */ diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { diff = div_u64((u64)diff, weight); @@ -274,6 +286,9 @@ next: return more; } +/* + * Ensure this RQ takes back all the runtime it lend to its neighbours. + */ static void __disable_runtime(struct rq *rq) { struct root_domain *rd = rq->rd; @@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq) spin_lock(&rt_b->rt_runtime_lock); spin_lock(&rt_rq->rt_runtime_lock); + /* + * Either we're all inf and nobody needs to borrow, or we're + * already disabled and thus have nothing to do, or we have + * exactly the right amount of runtime to take out. + */ if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) goto balanced; spin_unlock(&rt_rq->rt_runtime_lock); + /* + * Calculate the difference between what we started out with + * and what we current have, that's the amount of runtime + * we lend and now have to reclaim. + */ want = rt_b->rt_runtime - rt_rq->rt_runtime; + /* + * Greedy reclaim, take back as much as we can. + */ for_each_cpu_mask(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; + /* + * Can't reclaim from ourselves or disabled runqueues. + */ if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; @@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq) } spin_lock(&rt_rq->rt_runtime_lock); + /* + * We cannot be left wanting - that would mean some runtime + * leaked out of the system. + */ BUG_ON(want); balanced: + /* + * Disable all the borrow logic by pretending we have inf + * runtime - in which case borrowing doesn't make sense. + */ rt_rq->rt_runtime = RUNTIME_INF; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); @@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq) if (unlikely(!scheduler_running)) return; + /* + * Reset each runqueue's bandwidth settings + */ for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); -- cgit v1.1 From 4653f803e6e0d970ffeac0efd2c01743eb6c5228 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:44 +0200 Subject: sched: more sanity checks on the bandwidth settings While playing around with it, I noticed we missed some sanity checks. Also add some comments while we're there. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 669c49a..e1299de 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8866,11 +8866,29 @@ static int tg_schedulable(struct task_group *tg, void *data) runtime = d->rt_runtime; } + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + /* + * Ensure we don't starve existing RT tasks. + */ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) return -EBUSY; total = to_ratio(period, runtime); + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. + */ list_for_each_entry_rcu(child, &tg->children, siblings) { period = ktime_to_ns(child->rt_bandwidth.rt_period); runtime = child->rt_bandwidth.rt_runtime; @@ -8978,19 +8996,24 @@ long sched_group_rt_period(struct task_group *tg) static int sched_rt_global_constraints(void) { - struct task_group *tg = &root_task_group; - u64 rt_runtime, rt_period; + u64 runtime, period; int ret = 0; if (sysctl_sched_rt_period <= 0) return -EINVAL; - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = tg->rt_bandwidth.rt_runtime; + runtime = global_rt_runtime(); + period = global_rt_period(); + + /* + * Sanity check on the sysctl variables. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - ret = __rt_schedulable(tg, rt_period, rt_runtime); + ret = __rt_schedulable(NULL, 0, 0); read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); -- cgit v1.1 From 57fdc26d4a734a3e00c6b2fc0e1e40ff8da4dc31 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:45 +0200 Subject: sched: fixup buddy selection We should set the buddy even though we might already have the TIF_RESCHED flag set. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c59da7..e3f3c10 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1249,6 +1249,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (unlikely(se == pse)) return; + cfs_rq_of(pse)->next = pse; + /* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. @@ -1256,8 +1258,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (test_tsk_need_resched(curr)) return; - cfs_rq_of(pse)->next = pse; - /* * Batch tasks do not preempt (their preemption is driven by * the tick): -- cgit v1.1 From f9092f358bc2ec5367621478811f046f82873376 Mon Sep 17 00:00:00 2001 From: Jonathan Steel Date: Mon, 22 Sep 2008 13:57:45 -0700 Subject: kexec: fix segmentation fault in kimage_add_entry A segmentation fault can occur in kimage_add_entry in kexec.c when loading a kernel image into memory. The fault occurs because a page is requested by calling kimage_alloc_page with gfp_mask GFP_KERNEL and the function may actually return a page with gfp_mask GFP_HIGHUSER. The high mem page is returned because it was swapped with the kernel page due to the kernel page being a page that will shortly be copied to. This patch ensures that kimage_alloc_page returns a page that was created with the correct gfp flags. I have verified the change and fixed the whitespace damage of the original patch. Jonathan did a great job of tracking this down after he hit the problem. -- Eric Signed-off-by: Jonathan Steel Signed-off-by: Eric W. Biederman Acked-by: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 59f3f0df3..aef2653 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image, *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a - * destination page, so return it. + * destination page, so return it if it's + * gfp_flags honor the ones passed in. */ + if (!(gfp_mask & __GFP_HIGHMEM) && + PageHighMem(old_page)) { + kimage_free_pages(old_page); + continue; + } addr = old_addr; page = old_page; break; -- cgit v1.1 From 4aa7361179bed905fd0f35b236a5c65db683b9e0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:46 -0700 Subject: posix-timers: don't switch to ->group_leader if ->it_process dies posix_timer_event() drops SIGEV_THREAD_ID and switches to ->group_leader if send_sigqueue() fails. This is not very useful and doesn't work reliably. send_sigqueue() can only fail if ->it_process is dead. But it can die before it dequeues the SI_TIMER signal, in that case the timer stops anyway. Remove this code. I guess it was needed a long ago to ensure that the timer is not destroyed when when its creator thread dies. Q: perhaps it makes sense to change sys_timer_settime() to return an error if ->it_process is dead? Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e36d579..3dfd15a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -298,6 +298,7 @@ void do_schedule_next_timer(struct siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { + int shared, ret; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->do_schedule_next_timer(). @@ -316,20 +317,10 @@ int posix_timer_event(struct k_itimer *timr, int si_private) timr->sigq->info.si_tid = timr->it_id; timr->sigq->info.si_value = timr->it_sigev_value; - if (timr->it_sigev_notify & SIGEV_THREAD_ID) { - struct task_struct *leader; - int ret = send_sigqueue(timr->sigq, timr->it_process, 0); - - if (likely(ret >= 0)) - return ret; - - timr->it_sigev_notify = SIGEV_SIGNAL; - leader = timr->it_process->group_leader; - put_task_struct(timr->it_process); - timr->it_process = leader; - } - - return send_sigqueue(timr->sigq, timr->it_process, 1); + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); + ret = send_sigqueue(timr->sigq, timr->it_process, shared); + /* If we failed to send the signal the timer stops. */ + return ret > 0; } EXPORT_SYMBOL_GPL(posix_timer_event); -- cgit v1.1 From 918fc0372831dca73039e1577bfea0c2ce49bdb6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:46 -0700 Subject: posix-timers: always do get_task_struct(timer->it_process) Change the code to get/put timer->it_process regardless of SIGEV_THREAD_ID. This streamlines the create/destroy paths and allows us to simplify the usage of exit_itimers() in de_thread(). Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 3dfd15a..bd9c931 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -540,11 +540,10 @@ sys_timer_create(const clockid_t which_clock, */ spin_lock_irqsave(&process->sighand->siglock, flags); if (!(process->flags & PF_EXITING)) { + get_task_struct(process); new_timer->it_process = process; list_add(&new_timer->list, &process->signal->posix_timers); - if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - get_task_struct(process); spin_unlock_irqrestore(&process->sighand->siglock, flags); } else { spin_unlock_irqrestore(&process->sighand->siglock, flags); @@ -561,6 +560,7 @@ sys_timer_create(const clockid_t which_clock, new_timer->it_sigev_signo = SIGALRM; new_timer->it_sigev_value.sival_int = new_timer->it_id; process = current->group_leader; + get_task_struct(process); spin_lock_irqsave(&process->sighand->siglock, flags); new_timer->it_process = process; list_add(&new_timer->list, &process->signal->posix_timers); @@ -853,8 +853,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); @@ -881,8 +880,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); -- cgit v1.1 From 2cd499e38ec241691e4bce50bddc8f57e4cc9bd0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:47 -0700 Subject: posix-timers: sys_timer_create: remove the buggy PF_EXITING check sys_timer_create() return -EINVAL if the target thread has PF_EXITING. This doesn't really make sense, the sub-thread can die right after unlock. And in fact, this is just wrong. Without SIGEV_THREAD_ID good_sigevent() returns ->group_leader, and it is very possible that the leader is already dead. This is OK, we shouldn't return the error in this case. Remove this check and the comment. Note that the "process" was found under tasklist_lock, it must have ->sighand != NULL. Also, remove a couple of unneeded initializations. Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index bd9c931..60b2620 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -460,9 +460,9 @@ sys_timer_create(const clockid_t which_clock, timer_t __user * created_timer_id) { int error = 0; - struct k_itimer *new_timer = NULL; + struct k_itimer *new_timer; int new_timer_id; - struct task_struct *process = NULL; + struct task_struct *process; unsigned long flags; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@ -523,32 +523,12 @@ sys_timer_create(const clockid_t which_clock, read_lock(&tasklist_lock); if ((process = good_sigevent(&event))) { - /* - * We may be setting up this process for another - * thread. It may be exiting. To catch this - * case the we check the PF_EXITING flag. If - * the flag is not set, the siglock will catch - * him before it is too late (in exit_itimers). - * - * The exec case is a bit more invloved but easy - * to code. If the process is in our thread - * group (and it must be or we would not allow - * it here) and is doing an exec, it will cause - * us to be killed. In this case it will wait - * for us to die which means we can finish this - * linkage with our last gasp. I.e. no code :) - */ + get_task_struct(process); spin_lock_irqsave(&process->sighand->siglock, flags); - if (!(process->flags & PF_EXITING)) { - get_task_struct(process); - new_timer->it_process = process; - list_add(&new_timer->list, - &process->signal->posix_timers); - spin_unlock_irqrestore(&process->sighand->siglock, flags); - } else { - spin_unlock_irqrestore(&process->sighand->siglock, flags); - process = NULL; - } + new_timer->it_process = process; + list_add(&new_timer->list, + &process->signal->posix_timers); + spin_unlock_irqrestore(&process->sighand->siglock, flags); } read_unlock(&tasklist_lock); if (!process) { -- cgit v1.1 From 36b2f046000b358b62b9d116cb10a2b1c5be5cbf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:48 -0700 Subject: posix-timers: sys_timer_create: simplify and s/tasklist/rcu/ - Change the code to do rcu_read_lock() instead of taking tasklist_lock, it is safe to get_task_struct(p) if p was found under RCU. However, now we must not use process's sighand/signal, they may be NULL. We can use current->sighand/signal instead, this "process" must belong to the current's thread-group. - Factor out the common code for 2 "if (timer_event_spec)" branches, the !timer_event_spec case can use current too. - use spin_lock_irq() instead of _irqsave(), kill "flags". Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 60b2620..5b76190 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -463,7 +463,6 @@ sys_timer_create(const clockid_t which_clock, struct k_itimer *new_timer; int new_timer_id; struct task_struct *process; - unsigned long flags; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@ -521,16 +520,11 @@ sys_timer_create(const clockid_t which_clock, new_timer->it_sigev_signo = event.sigev_signo; new_timer->it_sigev_value = event.sigev_value; - read_lock(&tasklist_lock); - if ((process = good_sigevent(&event))) { + rcu_read_lock(); + process = good_sigevent(&event); + if (process) get_task_struct(process); - spin_lock_irqsave(&process->sighand->siglock, flags); - new_timer->it_process = process; - list_add(&new_timer->list, - &process->signal->posix_timers); - spin_unlock_irqrestore(&process->sighand->siglock, flags); - } - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!process) { error = -EINVAL; goto out; @@ -541,19 +535,18 @@ sys_timer_create(const clockid_t which_clock, new_timer->it_sigev_value.sival_int = new_timer->it_id; process = current->group_leader; get_task_struct(process); - spin_lock_irqsave(&process->sighand->siglock, flags); - new_timer->it_process = process; - list_add(&new_timer->list, &process->signal->posix_timers); - spin_unlock_irqrestore(&process->sighand->siglock, flags); } + spin_lock_irq(¤t->sighand->siglock); + new_timer->it_process = process; + list_add(&new_timer->list, ¤t->signal->posix_timers); + spin_unlock_irq(¤t->sighand->siglock); /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task * and may cease to exist at any time. Don't use or modify * new_timer after the unlock call. */ - out: if (error) release_posix_timer(new_timer, it_id_set); -- cgit v1.1 From 717835d94d3e3d343a302df0a3cb9405887c3e2a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:49 -0700 Subject: posix-timers: move the initialization of timer->sigq from send to create path posix_timer_event() always populates timer->sigq with the same numbers, move this code into sys_timer_create(). Note that with this patch we can kill it_sigev_signo and it_sigev_value. Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5b76190..c459b29 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -312,11 +312,6 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ timr->sigq->info.si_sys_private = si_private; - timr->sigq->info.si_signo = timr->it_sigev_signo; - timr->sigq->info.si_code = SI_TIMER; - timr->sigq->info.si_tid = timr->it_id; - timr->sigq->info.si_value = timr->it_sigev_value; - shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); ret = send_sigqueue(timr->sigq, timr->it_process, shared); /* If we failed to send the signal the timer stops. */ @@ -537,6 +532,11 @@ sys_timer_create(const clockid_t which_clock, get_task_struct(process); } + new_timer->sigq->info.si_code = SI_TIMER; + new_timer->sigq->info.si_tid = new_timer->it_id; + new_timer->sigq->info.si_signo = new_timer->it_sigev_signo; + new_timer->sigq->info.si_value = new_timer->it_sigev_value; + spin_lock_irq(¤t->sighand->siglock); new_timer->it_process = process; list_add(&new_timer->list, ¤t->signal->posix_timers); -- cgit v1.1 From ef864c958801768fb28bd3603cd0b098b394671c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:49 -0700 Subject: posix-timers: sys_timer_create: cleanup the error handling Cleanup. - sys_timer_create() is big and complicated. The code above the "out:" label relies on the fact that "error" must be == 0. This is not very robust, make the code more explicit. Remove the unneeded initialization of error. - If idr_get_new() succeeds (as it normally should), we check the returned value twice. Move the "-EAGAIN" check under "if (error)". Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index c459b29..7be385f 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -454,9 +454,8 @@ sys_timer_create(const clockid_t which_clock, struct sigevent __user *timer_event_spec, timer_t __user * created_timer_id) { - int error = 0; struct k_itimer *new_timer; - int new_timer_id; + int error, new_timer_id; struct task_struct *process; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@ -478,9 +477,9 @@ sys_timer_create(const clockid_t which_clock, error = idr_get_new(&posix_timers_id, (void *) new_timer, &new_timer_id); spin_unlock_irq(&idr_lock); - if (error == -EAGAIN) - goto retry; - else if (error) { + if (error) { + if (error == -EAGAIN) + goto retry; /* * Weird looking, but we return EAGAIN if the IDR is * full (proper POSIX return value for this) @@ -541,6 +540,8 @@ sys_timer_create(const clockid_t which_clock, new_timer->it_process = process; list_add(&new_timer->list, ¤t->signal->posix_timers); spin_unlock_irq(¤t->sighand->siglock); + + return 0; /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task @@ -548,9 +549,7 @@ sys_timer_create(const clockid_t which_clock, * new_timer after the unlock call. */ out: - if (error) - release_posix_timer(new_timer, it_id_set); - + release_posix_timer(new_timer, it_id_set); return error; } -- cgit v1.1 From 5a9fa73072854981a5c05eb7ba18a96d49c2804f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:50 -0700 Subject: posix-timers: kill ->it_sigev_signo and ->it_sigev_value With the recent changes ->it_sigev_signo and ->it_sigev_value are only used in sys_timer_create(), kill them. Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 7be385f..3eff47b 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -510,10 +510,6 @@ sys_timer_create(const clockid_t which_clock, error = -EFAULT; goto out; } - new_timer->it_sigev_notify = event.sigev_notify; - new_timer->it_sigev_signo = event.sigev_signo; - new_timer->it_sigev_value = event.sigev_value; - rcu_read_lock(); process = good_sigevent(&event); if (process) @@ -524,17 +520,18 @@ sys_timer_create(const clockid_t which_clock, goto out; } } else { - new_timer->it_sigev_notify = SIGEV_SIGNAL; - new_timer->it_sigev_signo = SIGALRM; - new_timer->it_sigev_value.sival_int = new_timer->it_id; + event.sigev_notify = SIGEV_SIGNAL; + event.sigev_signo = SIGALRM; + event.sigev_value.sival_int = new_timer->it_id; process = current->group_leader; get_task_struct(process); } - new_timer->sigq->info.si_code = SI_TIMER; + new_timer->it_sigev_notify = event.sigev_notify; + new_timer->sigq->info.si_signo = event.sigev_signo; + new_timer->sigq->info.si_value = event.sigev_value; new_timer->sigq->info.si_tid = new_timer->it_id; - new_timer->sigq->info.si_signo = new_timer->it_sigev_signo; - new_timer->sigq->info.si_value = new_timer->it_sigev_value; + new_timer->sigq->info.si_code = SI_TIMER; spin_lock_irq(¤t->sighand->siglock); new_timer->it_process = process; -- cgit v1.1 From 5a51b713ccf8835d5adf7217e2f86eb12b1ca851 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:51 -0700 Subject: posix-timers: lock_timer: kill the bogus ->it_id check lock_timer() checks that the timer found by idr_find(timer_id) has ->it_id == timer_id. This buys nothing. This check can fail only if sys_timer_create() unlocked idr_lock after idr_get_new(), but didn't set ->it_id = new_timer_id yet. But in that case ->it_process == NULL so lock_timer() can't succeed anyway. Also remove a couple of unneeded typecasts. Note that with or without this patch we have a small problem. sys_timer_create() doesn't ensure that the result of setting (say) ->it_sigev_notify must be visible if lock_timer() succeeds. Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 3eff47b..7185f05 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -474,8 +474,7 @@ sys_timer_create(const clockid_t which_clock, goto out; } spin_lock_irq(&idr_lock); - error = idr_get_new(&posix_timers_id, (void *) new_timer, - &new_timer_id); + error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); spin_unlock_irq(&idr_lock); if (error) { if (error == -EAGAIN) @@ -567,12 +566,12 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) */ spin_lock_irqsave(&idr_lock, *flags); - timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); + timr = idr_find(&posix_timers_id, (int) timer_id); if (timr) { spin_lock(&timr->it_lock); - if ((timr->it_id != timer_id) || !(timr->it_process) || - !same_thread_group(timr->it_process, current)) { + if (!timr->it_process || + !same_thread_group(timr->it_process, current)) { spin_unlock(&timr->it_lock); spin_unlock_irqrestore(&idr_lock, *flags); timr = NULL; -- cgit v1.1 From 31d9284569e38fb97117497af3e8047a6a3c86f0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:51 -0700 Subject: posix-timers: lock_timer: make it readable Cleanup. Imho makes the code much more understandable. At least this patch lessens both the source and compiled code. Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 7185f05..95451bf 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -556,7 +556,7 @@ out: * the find to the timer lock. To avoid a dead lock, the timer id MUST * be release with out holding the timer lock. */ -static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; /* @@ -564,23 +564,20 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) * flags part over to the timer lock. Must not let interrupts in * while we are moving the lock. */ - spin_lock_irqsave(&idr_lock, *flags); - timr = idr_find(&posix_timers_id, (int) timer_id); + timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { spin_lock(&timr->it_lock); - - if (!timr->it_process || - !same_thread_group(timr->it_process, current)) { - spin_unlock(&timr->it_lock); - spin_unlock_irqrestore(&idr_lock, *flags); - timr = NULL; - } else + if (timr->it_process && + same_thread_group(timr->it_process, current)) { spin_unlock(&idr_lock); - } else - spin_unlock_irqrestore(&idr_lock, *flags); + return timr; + } + spin_unlock(&timr->it_lock); + } + spin_unlock_irqrestore(&idr_lock, *flags); - return timr; + return NULL; } /* -- cgit v1.1 From eb3f938fd6292dc79f43a5fe14784b044776e9f0 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 22 Sep 2008 14:42:40 -0700 Subject: ntp: let update_persistent_clock() sleep This is a change that makes the 11-minute RTC update be run in the process context. This is so that update_persistent_clock() can sleep, which may be required for certain types of RTC hardware -- most notably I2C devices. Signed-off-by: Maciej W. Rozycki Cc: Roman Zippel Cc: Rik van Riel Cc: David Brownell Acked-by: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c6921aa1..450a45c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -10,13 +10,13 @@ #include #include -#include #include #include #include #include #include #include +#include #include /* @@ -218,11 +218,11 @@ void second_overflow(void) /* Disable the cmos update - used by virtualization and embedded */ int no_sync_cmos_clock __read_mostly; -static void sync_cmos_clock(unsigned long dummy); +static void sync_cmos_clock(struct work_struct *work); -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); +static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); -static void sync_cmos_clock(unsigned long dummy) +static void sync_cmos_clock(struct work_struct *work) { struct timespec now, next; int fail = 1; @@ -258,13 +258,13 @@ static void sync_cmos_clock(unsigned long dummy) next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } - mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); + schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } static void notify_cmos_timer(void) { if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); + schedule_delayed_work(&sync_cmos_work, 0); } #else -- cgit v1.1 From 5cd1c9c5cf30d4b33df3d3f74d8142f278d536b7 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Mon, 22 Sep 2008 14:42:43 -0700 Subject: timekeeping: fix rounding problem during clock update Due to a rounding problem during a clock update it's possible for readers to observe the clock jumping back by 1nsec. The following simplified example demonstrates the problem: cycle xtime 0 0 1000 999999.6 2000 1999999.2 3000 2999998.8 ... 1500 = 1499999.4 = 0.0 + 1499999.4 = 999999.6 + 499999.8 When reading the clock only the full nanosecond part is used, while timekeeping internally keeps nanosecond fractions. If the clock is now updated at cycle 1500 here, a nanosecond is missing due to the truncation. The simple fix is to round up the xtime value during the update, this also changes the distance to the reference time, but the adjustment will automatically take care that it stays under control. Signed-off-by: Roman Zippel Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e91c29f..5ecbfc3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -454,7 +454,7 @@ void update_wall_time(void) #else offset = clock->cycle_interval; #endif - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. @@ -479,9 +479,12 @@ void update_wall_time(void) /* correct the clock when NTP error is too big */ clocksource_adjust(offset); - /* store full nanoseconds into xtime */ - xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; + /* store full nanoseconds into xtime after rounding it up and + * add the remainder to the error difference. + */ + xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); update_xtime_cache(cyc2ns(clock, offset)); -- cgit v1.1 From d40e944c25fb4642adb2a4c580a48218a9f3f824 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Mon, 22 Sep 2008 14:42:44 -0700 Subject: ntp: improve adjtimex frequency rounding Change PPM_SCALE_INV_SHIFT so that it doesn't throw away any input bits (19 is the amount of the factor 2 in PPM_SCALE), the output frequency can then be calculated back to its input value, as the inverse divide produce a slightly larger value, which is then correctly rounded by the final shift. Reported-by: Martin Ziegler Signed-off-by: Roman Zippel Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 450a45c..ddb0465 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -406,9 +406,8 @@ adj_done: if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; - txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * - (s64)PPM_SCALE_INV, - NTP_SCALE_SHIFT); + txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * + (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; -- cgit v1.1 From b87f17242da6b2ac6db2d179b2f93fb84cff2fbe Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Thu, 25 Sep 2008 09:53:54 +0530 Subject: sched: maintain only task entities in cfs_rq->tasks list cfs_rq->tasks list is used by the load balancer to iterate over all the tasks. Currently it holds all the entities (both task and group entities) because of which there is a need to check for group entities explicitly during load balancing. This patch changes the cfs_rq->tasks list to hold only task entities. Signed-off-by: Bharata B Rao Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e3f3c10..95c1295 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -528,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) inc_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, se->load.weight); + list_add(&se->group_node, &cfs_rq->tasks); + } cfs_rq->nr_running++; se->on_rq = 1; - list_add(&se->group_node, &cfs_rq->tasks); } static void @@ -541,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) dec_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, -se->load.weight); + list_del_init(&se->group_node); + } cfs_rq->nr_running--; se->on_rq = 0; - list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1335,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) if (next == &cfs_rq->tasks) return NULL; - /* Skip over entities that are not tasks */ - do { - se = list_entry(next, struct sched_entity, group_node); - next = next->next; - } while (next != &cfs_rq->tasks && !entity_is_task(se)); - - if (next == &cfs_rq->tasks && !entity_is_task(se)) - return NULL; - - cfs_rq->balance_iterator = next; - - if (entity_is_task(se)) - p = task_of(se); + se = list_entry(next, struct sched_entity, group_node); + p = task_of(se); + cfs_rq->balance_iterator = next->next; return p; } -- cgit v1.1 From 379daf6290814e41f14880094b7b773640df2461 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 25 Sep 2008 18:43:34 -0700 Subject: IO resources, x86: ioremap sanity check to catch mapping requests exceeding the BAR sizes Go through the iomem resource tree to check if any of the ioremap() requests span more than any slot in the iomem resource tree and do a WARN_ON() if we hit this check. This will raise a red-flag, if some driver is mapping more than what is needed. And hopefully identify possible corruptions much earlier. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/resource.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index fc59dcc..1d003a5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -827,3 +827,36 @@ static int __init reserve_setup(char *str) } __setup("reserve=", reserve_setup); + +/* + * Check if the requested addr and size spans more than any slot in the + * iomem resource tree. + */ +int iomem_map_sanity_check(resource_size_t addr, unsigned long size) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + continue; + if (p->end < addr) + continue; + if (p->start <= addr && (p->end >= addr + size - 1)) + continue; + printk(KERN_WARNING "resource map sanity check conflict: " + "0x%llx 0x%llx 0x%llx 0x%llx %s\n", + addr, addr + size - 1, p->start, p->end, p->name); + err = -1; + break; + } + read_unlock(&resource_lock); + + return err; +} -- cgit v1.1 From 13eb83754b40bf01dc84e52a08d4196d1b719a0e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 26 Sep 2008 10:10:12 +0200 Subject: IO resources, x86: ioremap sanity check to catch mapping requests exceeding, fix fix this build error: kernel/resource.c: In function 'iomem_map_sanity_check': kernel/resource.c:842: error: implicit declaration of function 'r_next' kernel/resource.c:842: warning: assignment makes pointer from integer without a cast r_next() was only available if CONFIG_PROCFS was enabled. and fix this build warning: kernel/resource.c:855: warning: format '%llx' expects type 'long long unsigned int', but argument 2 has type 'resource_size_t' kernel/resource.c:855: warning: format '%llx' expects type 'long long unsigned int', but argument 3 has type 'long unsigned int' kernel/resource.c:855: warning: format '%llx' expects type 'long long unsigned int', but argument 4 has type 'resource_size_t' kernel/resource.c:855: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'resource_size_t' resource_t can be 32 bits. Signed-off-by: Ingo Molnar --- kernel/resource.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 1d003a5..7797dae 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -38,10 +38,6 @@ EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); -#ifdef CONFIG_PROC_FS - -enum { MAX_IORES_LEVEL = 5 }; - static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -53,6 +49,10 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +#ifdef CONFIG_PROC_FS + +enum { MAX_IORES_LEVEL = 5 }; + static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { @@ -852,7 +852,11 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) continue; printk(KERN_WARNING "resource map sanity check conflict: " "0x%llx 0x%llx 0x%llx 0x%llx %s\n", - addr, addr + size - 1, p->start, p->end, p->name); + (unsigned long long)addr, + (unsigned long long)(addr + size - 1), + (unsigned long long)p->start, + (unsigned long long)p->end, + p->name); err = -1; break; } -- cgit v1.1 From 18d6522b86d21a04c8ac1ea79747e2e434a956d9 Mon Sep 17 00:00:00 2001 From: Atsuo Igarashi Date: Fri, 26 Sep 2008 10:36:41 -0500 Subject: kgdb: could not write to the last of valid memory with kgdb On the ARM architecture, kgdb will crash the kernel if the last byte of valid memory is written due to a flush_icache_range flushing beyond the memory boundary. Signed-off-by: Atsuo Igarashi Signed-off-by: Jason Wessel --- kernel/kgdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index eaa21fc..949806a 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -488,7 +488,7 @@ static int write_mem_msg(int binary) if (err) return err; if (CACHE_FLUSH_IS_SAFE) - flush_icache_range(addr, addr + length + 1); + flush_icache_range(addr, addr + length); return 0; } -- cgit v1.1 From d7161a65341556bacb5e6654e133803f46f51063 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 26 Sep 2008 10:36:41 -0500 Subject: kgdb, x86, arm, mips, powerpc: ignore user space single stepping On the x86 arch, user space single step exceptions should be ignored if they occur in the kernel space, such as ptrace stepping through a system call. First check if it is kgdb that is executing a single step, then ensure it is not an accidental traversal into the user space, while in kgdb, any other time the TIF_SINGLESTEP is set, kgdb should ignore the exception. On x86, arm, mips and powerpc, the kgdb_contthread usage was inconsistent with the way single stepping is implemented in the kgdb core. The arch specific stub should always set the kgdb_cpu_doing_single_step correctly if it is single stepping. This allows kgdb to correctly process an instruction steps if ptrace happens to be requesting an instruction step over a system call. Signed-off-by: Jason Wessel --- kernel/kgdb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 949806a..25d955d 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1462,7 +1462,7 @@ acquirelock: * Get the passive CPU lock which will hold all the non-primary * CPU in a spin state while the debugger is active */ - if (!kgdb_single_step || !kgdb_contthread) { + if (!kgdb_single_step) { for (i = 0; i < NR_CPUS; i++) atomic_set(&passive_cpu_wait[i], 1); } @@ -1475,7 +1475,7 @@ acquirelock: #ifdef CONFIG_SMP /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) + if ((!kgdb_single_step) && kgdb_do_roundup) kgdb_roundup_cpus(flags); #endif @@ -1494,7 +1494,7 @@ acquirelock: kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); kgdb_deactivate_sw_breakpoints(); kgdb_single_step = 0; - kgdb_contthread = NULL; + kgdb_contthread = current; exception_level = 0; /* Talk to debugger with gdbserial protocol */ @@ -1508,7 +1508,7 @@ acquirelock: kgdb_info[ks->cpu].task = NULL; atomic_set(&cpu_in_kgdb[ks->cpu], 0); - if (!kgdb_single_step || !kgdb_contthread) { + if (!kgdb_single_step) { for (i = NR_CPUS-1; i >= 0; i--) atomic_set(&passive_cpu_wait[i], 0); /* -- cgit v1.1 From 7086efe1c1536f6bc160e7d60a9bfd645b91f279 Mon Sep 17 00:00:00 2001 From: Frank Mayhar Date: Fri, 12 Sep 2008 09:54:39 -0700 Subject: timers: fix itimer/many thread hang, v3 - fix UP lockup - another set of UP/SMP cleanups and simplifications Signed-off-by: Frank Mayhar Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - kernel/sched_stats.h | 126 ++++++++++++++++----------------------------------- 2 files changed, 38 insertions(+), 89 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 260c22c..29a3152 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4046,7 +4046,6 @@ unsigned long long task_delta_exec(struct task_struct *p) unsigned long flags; u64 ns = 0; - rq = task_rq_lock(p, &flags); if (task_current(rq, p)) { u64 delta_exec; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index d6903bd..b8c1569 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -276,133 +276,83 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) * on CONFIG_SCHEDSTATS. */ -#ifdef CONFIG_SMP - /** - * thread_group_cputime_account_user - Maintain utime for a thread group. + * account_group_user_time - Maintain utime for a thread group. * - * @tgtimes: Pointer to thread_group_cputime structure. - * @cputime: Time value by which to increment the utime field of that - * structure. + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the utime field of the + * thread_group_cputime structure. * * If thread group time is being maintained, get the structure for the * running CPU and update the utime field there. */ -static inline void thread_group_cputime_account_user( - struct thread_group_cputime *tgtimes, - cputime_t cputime) +static inline void account_group_user_time(struct task_struct *tsk, + cputime_t cputime) { - if (tgtimes->totals) { + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { struct task_cputime *times; - times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); times->utime = cputime_add(times->utime, cputime); put_cpu_no_resched(); } } /** - * thread_group_cputime_account_system - Maintain stime for a thread group. + * account_group_system_time - Maintain stime for a thread group. * - * @tgtimes: Pointer to thread_group_cputime structure. - * @cputime: Time value by which to increment the stime field of that - * structure. + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the stime field of the + * thread_group_cputime structure. * * If thread group time is being maintained, get the structure for the * running CPU and update the stime field there. */ -static inline void thread_group_cputime_account_system( - struct thread_group_cputime *tgtimes, - cputime_t cputime) +static inline void account_group_system_time(struct task_struct *tsk, + cputime_t cputime) { - if (tgtimes->totals) { + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { struct task_cputime *times; - times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); times->stime = cputime_add(times->stime, cputime); put_cpu_no_resched(); } } /** - * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a - * thread group. + * account_group_exec_runtime - Maintain exec runtime for a thread group. * - * @tgtimes: Pointer to thread_group_cputime structure. + * @tsk: Pointer to task structure. * @ns: Time value by which to increment the sum_exec_runtime field - * of that structure. + * of the thread_group_cputime structure. * * If thread group time is being maintained, get the structure for the * running CPU and update the sum_exec_runtime field there. */ -static inline void thread_group_cputime_account_exec_runtime( - struct thread_group_cputime *tgtimes, - unsigned long long ns) +static inline void account_group_exec_runtime(struct task_struct *tsk, + unsigned long long ns) { - if (tgtimes->totals) { + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { struct task_cputime *times; - times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); times->sum_exec_runtime += ns; put_cpu_no_resched(); } } - -#else /* CONFIG_SMP */ - -static inline void thread_group_cputime_account_user( - struct thread_group_cputime *tgtimes, - cputime_t cputime) -{ - tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime); -} - -static inline void thread_group_cputime_account_system( - struct thread_group_cputime *tgtimes, - cputime_t cputime) -{ - tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime); -} - -static inline void thread_group_cputime_account_exec_runtime( - struct thread_group_cputime *tgtimes, - unsigned long long ns) -{ - tgtimes->totals->sum_exec_runtime += ns; -} - -#endif /* CONFIG_SMP */ - -/* - * These are the generic time-accounting routines that use the above - * functions. They are the functions actually called by the scheduler. - */ -static inline void account_group_user_time(struct task_struct *tsk, - cputime_t cputime) -{ - struct signal_struct *sig; - - sig = tsk->signal; - if (likely(sig)) - thread_group_cputime_account_user(&sig->cputime, cputime); -} - -static inline void account_group_system_time(struct task_struct *tsk, - cputime_t cputime) -{ - struct signal_struct *sig; - - sig = tsk->signal; - if (likely(sig)) - thread_group_cputime_account_system(&sig->cputime, cputime); -} - -static inline void account_group_exec_runtime(struct task_struct *tsk, - unsigned long long ns) -{ - struct signal_struct *sig; - - sig = tsk->signal; - if (likely(sig)) - thread_group_cputime_account_exec_runtime(&sig->cputime, ns); -} -- cgit v1.1 From 7659e349672bb0d378ef8d7d62bae4c53d2bdd18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 14:06:45 +0200 Subject: hrtimer: migrate pending list on cpu offline Impact: hrtimers which are on the pending list are not migrated at cpu offline and can be stale forever Add the pending list migration when CONFIG_HIGH_RES_TIMERS is enabled Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b8e4dce..580bc66 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1610,10 +1610,36 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } +#ifdef CONFIG_HIGH_RES_TIMERS +static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, + struct hrtimer_cpu_base *new_base) +{ + struct hrtimer *timer; + int raise = 0; + + while (!list_empty(&old_base->cb_pending)) { + timer = list_entry(old_base->cb_pending.next, + struct hrtimer, cb_entry); + + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0); + timer->base = &new_base->clock_base[timer->base->index]; + list_add_tail(&timer->cb_entry, &new_base->cb_pending); + raise = 1; + } + return raise; +} +#else +static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, + struct hrtimer_cpu_base *new_base) +{ + return 0; +} +#endif + static void migrate_hrtimers(int cpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i; + int i, raise = 0; BUG_ON(cpu_online(cpu)); old_base = &per_cpu(hrtimer_bases, cpu); @@ -1630,10 +1656,16 @@ static void migrate_hrtimers(int cpu) &new_base->clock_base[i]); } + if (migrate_hrtimer_pending(old_base, new_base)) + raise = 1; + spin_unlock(&old_base->lock); spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(hrtimer_bases); + + if (raise) + hrtimer_raise_softirq(); } #endif /* CONFIG_HOTPLUG_CPU */ -- cgit v1.1 From 41e1022eae71707f1ce6801a746f70b1e57b7567 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 14:09:39 +0200 Subject: hrtimer: fix migration of CB_IRQSAFE_NO_SOFTIRQ hrtimers Impact: Stale timers after a CPU went offline. commit 37bb6cb4097e29ffee970065b74499cbf10603a3 hrtimer: unlock hrtimer_wakeup changed the hrtimer sleeper callback mode to CB_IRQSAFE_NO_SOFTIRQ due to locking problems. A result of this change is that when enqueue is called for an already expired hrtimer the callback function is not longer called directly from the enqueue code. The normal callers have been fixed in the code, but the migration code which moves hrtimers from a dead CPU to a live CPU was not made aware of this. This can be fixed by checking the timer state after the call to enqueue in the migration code. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 580bc66..ac2f6d6 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1591,11 +1591,12 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU -static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, +static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct rb_node *node; + int raise = 0; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); @@ -1607,7 +1608,27 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * Enqueue the timer. Allow reprogramming of the event device */ enqueue_hrtimer(timer, new_base, 1); + +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * Happens with high res enabled when the timer was + * already expired and the callback mode is + * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ + * (hrtimer_sleeper). The enqueue code does not move + * them to the soft irq pending list for + * performance/latency reasons, but in the migration + * state, we need to do that otherwise we end up with + * a stale timer. + */ + if (timer->state == HRTIMER_STATE_INACTIVE) { + timer->state = HRTIMER_STATE_PENDING; + list_add_tail(&timer->cb_entry, + &new_base->cpu_base->cb_pending); + raise = 1; + } +#endif } + return raise; } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1652,8 +1673,9 @@ static void migrate_hrtimers(int cpu) spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); + if (migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i])) + raise = 1; } if (migrate_hrtimer_pending(old_base, new_base)) -- cgit v1.1 From b00c1a99e7758f794923c61e5cd55268d61c9469 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 15:44:46 +0200 Subject: hrtimer: mark migration state Impact: during migration active hrtimers can be seen as inactive The migration code removes the hrtimers from the queues of the dead CPU and sets the state temporary to INACTIVE. The enqueue code sets it to ACTIVE/PENDING again. Prevent that the wrong state can be seen by using a separate migration state bit. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ac2f6d6..ace723d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1602,7 +1602,13 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); - __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); + + /* + * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * timer could be seen as !active and just vanish away + * under us on another CPU + */ + __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* * Enqueue the timer. Allow reprogramming of the event device @@ -1620,13 +1626,15 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * state, we need to do that otherwise we end up with * a stale timer. */ - if (timer->state == HRTIMER_STATE_INACTIVE) { + if (timer->state == HRTIMER_STATE_MIGRATE) { timer->state = HRTIMER_STATE_PENDING; list_add_tail(&timer->cb_entry, &new_base->cpu_base->cb_pending); raise = 1; } #endif + /* Clear the migration state bit */ + timer->state &= ~HRTIMER_STATE_MIGRATE; } return raise; } -- cgit v1.1 From ccc7dadf736639da86f3e0c86832c11a66fc8221 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 15:47:42 +0200 Subject: hrtimer: prevent migration of per CPU hrtimers Impact: per CPU hrtimers can be migrated from a dead CPU The hrtimer code has no knowledge about per CPU timers, but we need to prevent the migration of such timers and warn when such a timer is active at migration time. Explicitely mark the timers as per CPU and use a more understandable mode descriptor for the interrupts safe unlocked callback mode, which is used by hrtimer_sleeper and the scheduler code. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 37 +++++++++++++++++++++++++------------ kernel/sched.c | 4 ++-- kernel/time/tick-sched.c | 2 +- kernel/trace/trace_sysprof.c | 2 +- 4 files changed, 29 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ace723d..cdec83e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, */ BUG_ON(timer->function(timer) != HRTIMER_NORESTART); return 1; - case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: + case HRTIMER_CB_IRQSAFE_PERCPU: + case HRTIMER_CB_IRQSAFE_UNLOCKED: /* * This is solely for the sched tick emulation with * dynamic tick support to ensure that we do not * restart the tick right on the edge and end up with * the tick timer in the softirq ! The calling site - * takes care of this. + * takes care of this. Also used for hrtimer sleeper ! */ debug_hrtimer_deactivate(timer); return 1; @@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer) timer_stats_account_hrtimer(timer); fn = timer->function; - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || + timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { /* * Used for scheduler timers, avoid lock inversion with * rq->lock and tasklist_lock. @@ -1452,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) sl->timer.function = hrtimer_wakeup; sl->task = task; #ifdef CONFIG_HIGH_RES_TIMERS - sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; #endif } @@ -1592,7 +1594,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) + struct hrtimer_clock_base *new_base, int dcpu) { struct hrtimer *timer; struct rb_node *node; @@ -1604,6 +1606,18 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, debug_hrtimer_deactivate(timer); /* + * Should not happen. Per CPU timers should be + * canceled _before_ the migration code is called + */ + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) { + __remove_hrtimer(timer, old_base, + HRTIMER_STATE_INACTIVE, 0); + WARN(1, "hrtimer (%p %p)active but cpu %d dead\n", + timer, timer->function, dcpu); + continue; + } + + /* * Mark it as STATE_MIGRATE not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU @@ -1619,12 +1633,11 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, /* * Happens with high res enabled when the timer was * already expired and the callback mode is - * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ - * (hrtimer_sleeper). The enqueue code does not move - * them to the soft irq pending list for - * performance/latency reasons, but in the migration - * state, we need to do that otherwise we end up with - * a stale timer. + * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The + * enqueue code does not move them to the soft irq + * pending list for performance/latency reasons, but + * in the migration state, we need to do that + * otherwise we end up with a stale timer. */ if (timer->state == HRTIMER_STATE_MIGRATE) { timer->state = HRTIMER_STATE_PENDING; @@ -1682,7 +1695,7 @@ static void migrate_hrtimers(int cpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { if (migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i])) + &new_base->clock_base[i], cpu)) raise = 1; } diff --git a/kernel/sched.c b/kernel/sched.c index 13dd2db..ad1962d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -201,7 +201,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; } static void start_rt_bandwidth(struct rt_bandwidth *rt_b) @@ -1119,7 +1119,7 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } #else static inline void hrtick_clear(struct rq *rq) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 39019b3f..cb02324 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -625,7 +625,7 @@ void tick_setup_sched_timer(void) */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; /* Get the next period (per cpu) */ ts->sched_timer.expires = tick_init_jiffy_update(); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index bb948e52..db58fb6 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -202,7 +202,7 @@ static void start_stack_timer(int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = stack_trace_timer_fn; - hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); } -- cgit v1.1 From 31a78f23bac0069004e69f98808b6988baccb6b6 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Sun, 28 Sep 2008 23:09:31 +0100 Subject: mm owner: fix race between swapoff and exit There's a race between mm->owner assignment and swapoff, more easily seen when task slab poisoning is turned on. The condition occurs when try_to_unuse() runs in parallel with an exiting task. A similar race can occur with callers of get_task_mm(), such as /proc// or ptrace or page migration. CPU0 CPU1 try_to_unuse looks at mm = task0->mm increments mm->mm_users task 0 exits mm->owner needs to be updated, but no new owner is found (mm_users > 1, but no other task has task->mm = task0->mm) mm_update_next_owner() leaves mmput(mm) decrements mm->mm_users task0 freed dereferencing mm->owner fails The fix is to notify the subsystem via mm_owner_changed callback(), if no new owner is found, by specifying the new task as NULL. Jiri Slaby: mm->owner was set to NULL prior to calling cgroup_mm_owner_callbacks(), but must be set after that, so as not to pass NULL as old owner causing oops. Daisuke Nishimura: mm_update_next_owner() may set mm->owner to NULL, but mem_cgroup_from_task() and its callers need to take account of this situation to avoid oops. Hugh Dickins: Lockdep warning and hang below exec_mmap() when testing these patches. exit_mm() up_reads mmap_sem before calling mm_update_next_owner(), so exec_mmap() now needs to do the same. And with that repositioning, there's now no point in mm_need_new_owner() allowing for NULL mm. Reported-by: Hugh Dickins Signed-off-by: Balbir Singh Signed-off-by: Jiri Slaby Signed-off-by: Daisuke Nishimura Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 +++-- kernel/exit.c | 12 ++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 13932abd..a0123d7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2738,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child) */ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) { - struct cgroup *oldcgrp, *newcgrp; + struct cgroup *oldcgrp, *newcgrp = NULL; if (need_mm_owner_callback) { int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; oldcgrp = task_cgroup(old, ss->subsys_id); - newcgrp = task_cgroup(new, ss->subsys_id); + if (new) + newcgrp = task_cgroup(new, ss->subsys_id); if (oldcgrp == newcgrp) continue; if (ss->mm_owner_changed) diff --git a/kernel/exit.c b/kernel/exit.c index 1639564..85a83c8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) * If there are other users of the mm and the owner (us) is exiting * we need to find a new owner to take on the responsibility. */ - if (!mm) - return 0; if (atomic_read(&mm->mm_users) <= 1) return 0; if (mm->owner != p) @@ -627,6 +625,16 @@ retry: } while_each_thread(g, c); read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL, + * so that subsystems can understand the callback and take action. + */ + down_write(&mm->mmap_sem); + cgroup_mm_owner_callbacks(mm->owner, NULL); + mm->owner = NULL; + up_write(&mm->mmap_sem); return; assign_new_owner: -- cgit v1.1 From bfcd17a6c5529bc37234cfa720a047cf9397bcfc Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 6 Aug 2008 15:12:22 +0200 Subject: Configure out file locking features This patch adds the CONFIG_FILE_LOCKING option which allows to remove support for advisory locks. With this patch enabled, the flock() system call, the F_GETLK, F_SETLK and F_SETLKW operations of fcntl() and NFS support are disabled. These features are not necessarly needed on embedded systems. It allows to save ~11 Kb of kernel code and data: text data bss dec hex filename 1125436 118764 212992 1457192 163c28 vmlinux.old 1114299 118564 212992 1445855 160fdf vmlinux -11137 -200 0 -11337 -2C49 +/- This patch has originally been written by Matt Mackall , and is part of the Linux Tiny project. Signed-off-by: Thomas Petazzoni Signed-off-by: Matt Mackall Cc: matthew@wil.cx Cc: linux-fsdevel@vger.kernel.org Cc: mpm@selenic.com Cc: akpm@linux-foundation.org Signed-off-by: J. Bruce Fields --- kernel/sys_ni.c | 1 + kernel/sysctl.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 08d6e1b..503d8d4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -125,6 +125,7 @@ cond_syscall(sys_vm86old); cond_syscall(sys_vm86); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); +cond_syscall(sys_flock); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 50ec088..4588b2c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -97,7 +97,7 @@ static int sixty = 60; static int neg_one = -1; #endif -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING) static int two = 2; #endif @@ -1261,6 +1261,7 @@ static struct ctl_table fs_table[] = { .extra1 = &minolduid, .extra2 = &maxolduid, }, +#ifdef CONFIG_FILE_LOCKING { .ctl_name = FS_LEASES, .procname = "leases-enable", @@ -1269,6 +1270,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif #ifdef CONFIG_DNOTIFY { .ctl_name = FS_DIR_NOTIFY, @@ -1280,6 +1282,7 @@ static struct ctl_table fs_table[] = { }, #endif #ifdef CONFIG_MMU +#ifdef CONFIG_FILE_LOCKING { .ctl_name = FS_LEASE_TIME, .procname = "lease-break-time", @@ -1291,6 +1294,7 @@ static struct ctl_table fs_table[] = { .extra1 = &zero, .extra2 = &two, }, +#endif { .procname = "aio-nr", .data = &aio_nr, -- cgit v1.1 From 1508487e7f16d992ad23cabd3712563ff912f413 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 30 Sep 2008 08:28:17 +0200 Subject: timers: fix itimer/many thread hang, fix fix bogus rq dereference: v3 removed the locking but also removed the rq initialization. Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 29a3152..ebb03de 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4042,10 +4042,12 @@ EXPORT_PER_CPU_SYMBOL(kstat); */ unsigned long long task_delta_exec(struct task_struct *p) { - struct rq *rq; unsigned long flags; + struct rq *rq; u64 ns = 0; + rq = task_rq_lock(p, &flags); + if (task_current(rq, p)) { u64 delta_exec; @@ -4055,6 +4057,8 @@ unsigned long long task_delta_exec(struct task_struct *p) ns = delta_exec; } + task_rq_unlock(rq, &flags); + return ns; } -- cgit v1.1 From 64b9e0294d24a4204232e13e01630b0690e48d61 Mon Sep 17 00:00:00 2001 From: "Amit K. Arora" Date: Tue, 30 Sep 2008 17:15:39 +0530 Subject: sched: minor optimizations in wake_affine and select_task_rq_fair This patch does following: o Removes unused variable and argument "rq". o Optimizes one of the "if" conditions in wake_affine() - i.e. if "balanced" is true, we need not do rest of the calculations in the condition. o If this cpu is same as the previous cpu (on which woken up task was running when it went to sleep), no need to call wake_affine at all. Signed-off-by: Amit K Arora Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 95c1295..fcbe850a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1088,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif static int -wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, +wake_affine(struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) @@ -1136,8 +1136,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || - balanced) { + if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= + tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1156,16 +1156,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync) struct sched_domain *sd, *this_sd = NULL; int prev_cpu, this_cpu, new_cpu; unsigned long load, this_load; - struct rq *rq, *this_rq; + struct rq *this_rq; unsigned int imbalance; int idx; prev_cpu = task_cpu(p); - rq = task_rq(p); this_cpu = smp_processor_id(); this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; + if (prev_cpu == this_cpu) + goto out; /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: @@ -1193,13 +1194,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, load, this_load, imbalance)) return this_cpu; - if (prev_cpu == this_cpu) - goto out; - /* * Start passive balancing when half the imbalance_pct * limit is reached. -- cgit v1.1 From 8e85b4b553fc932e1c5141feb5fda389b7f5db01 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Oct 2008 10:50:53 +0200 Subject: softirqs, debug: preemption check if a preempt count leaks out of a softirq handler it can be very hard to figure it out. Add a debug check for this. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/softirq.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 82e32aa..1cf1e2f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -205,7 +205,18 @@ restart: do { if (pending & 1) { + int prev_count = preempt_count(); + h->action(h); + + if (unlikely(prev_count != preempt_count())) { + printk(KERN_ERR "huh, entered sotfirq %ld %p" + "with preempt_count %08x," + " exited with %08x?\n", h - softirq_vec, + h->action, prev_count, preempt_count()); + preempt_count() = prev_count; + } + rcu_bh_qsctr_inc(cpu); } h++; -- cgit v1.1 From aa94fbd5ccd840c8ab26d02439ec799b03a72547 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 2 Oct 2008 14:50:14 -0700 Subject: fix error-path NULL deref in alloc_posix_timer() Found by static checker (http://repo.or.cz/w/smatch.git). Signed-off-by: Dan Carpenter Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e36d579..5131e54 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -441,7 +441,7 @@ static struct k_itimer * alloc_posix_timer(void) return tmr; if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { kmem_cache_free(posix_timers_cache, tmr); - tmr = NULL; + return NULL; } memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); return tmr; -- cgit v1.1 From 2133b5d7ff531bc15a923db4a6a50bf96c561be9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Oct 2008 16:06:39 -0700 Subject: rcu: RCU-based detection of stalled CPUs for Classic RCU This patch adds stalled-CPU detection to Classic RCU. This capability is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which defaults disabled. This is a debugging feature to detect infinite loops in kernel code, not something that non-kernel-hackers would be expected to care about. This feature can detect looping CPUs in !PREEMPT builds and looping CPUs with preemption disabled in PREEMPT builds. This is essentially a port of this functionality from the treercu patch, replacing the stall debug patch that is already in tip/core/rcu (commit 67182ae1c4). The changes from the patch in tip/core/rcu include making the config variable name match that in treercu, changing from seconds to jiffies to avoid spurious warnings, and printing a boot message when this feature is enabled. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 166 +++++++++++++++++++++++++++------------------------- 1 file changed, 86 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index ed15128..0d07e6e 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, } } +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + unsigned long flags; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rcp->lock, flags); + delta = jiffies - rcp->jiffies_stall; + if (delta < 2 || rcp->cur != rcp->completed) { + spin_unlock_irqrestore(&rcp->lock, flags); + return; + } + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_possible_cpu(cpu) { + if (cpu_isset(cpu, rcp->cpumask)) + printk(" %d", cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rcp->gp_start)); +} + +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + smp_processor_id(), jiffies, + jiffies - rcp->gp_start); + dump_stack(); + spin_lock_irqsave(&rcp->lock, flags); + if ((long)(jiffies - rcp->jiffies_stall) >= 0) + rcp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + long delta; + + delta = jiffies - rcp->jiffies_stall; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rcp); + + } else if (rcp->cur != rcp->completed && delta >= 2) { + + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp) * period (if necessary). */ -#ifdef CONFIG_DEBUG_RCU_STALL - -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) -{ - rcp->gp_check = get_seconds() + 3; -} - -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) -{ - int cpu; - long delta; - unsigned long flags; - - /* Only let one CPU complain about others per time interval. */ - - spin_lock_irqsave(&rcp->lock, flags); - delta = get_seconds() - rcp->gp_check; - if (delta < 2L || cpus_empty(rcp->cpumask)) { - spin_unlock(&rcp->lock); - return; - } - rcp->gp_check = get_seconds() + 30; - spin_unlock_irqrestore(&rcp->lock, flags); - - /* OK, time to rat on our buddy... */ - - printk(KERN_ERR "RCU detected CPU stalls:"); - for_each_cpu_mask(cpu, rcp->cpumask) - printk(" %d", cpu); - printk(" (detected by %d, t=%lu/%lu)\n", - smp_processor_id(), get_seconds(), rcp->gp_check); -} - -static void print_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long flags; - - printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", - smp_processor_id(), get_seconds(), rcp->gp_check); - dump_stack(); - spin_lock_irqsave(&rcp->lock, flags); - if ((long)(get_seconds() - rcp->gp_check) >= 0L) - rcp->gp_check = get_seconds() + 30; - spin_unlock_irqrestore(&rcp->lock, flags); -} - -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - long delta; - - delta = get_seconds() - rcp->gp_check; - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { - - /* We haven't checked in, so go dump stack. */ - - print_cpu_stall(rcp); - - } else { - if (!cpus_empty(rcp->cpumask) && delta >= 2L) { - /* They had two seconds to dump stack, so complain. */ - print_other_cpu_stall(rcp); - } - } -} - -#else /* #ifdef CONFIG_DEBUG_RCU_STALL */ - -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) -{ -} - -static inline void -check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ -} - -#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ - /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { rcp->cur++; - record_gp_check_time(rcp); + record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { /* Check for CPU stalls, if enabled. */ - check_cpu_stall(rcp, rdp); + check_cpu_stall(rcp); if (rdp->nxtlist) { long completed_snap = ACCESS_ONCE(rcp->completed); @@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { */ void __init __rcu_init(void) { +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ -- cgit v1.1 From 2ec2b482b10a1ed3493c224f1893cddd3d33833b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Oct 2008 10:41:00 +0200 Subject: rcu: RCU-based detection of stalled CPUs for Classic RCU, fix fix the !CONFIG_RCU_CPU_STALL_DETECTOR path: kernel/rcuclassic.c: In function '__rcu_pending': kernel/rcuclassic.c:609: error: too few arguments to function 'check_cpu_stall' Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 0d07e6e..37f72e551 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -239,7 +239,7 @@ static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) { } -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) { } -- cgit v1.1 From 77af7e3403e7314c47b0c07fbc5e4ef21d939532 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 3 Oct 2008 11:39:46 +0200 Subject: softirq, warning fix: correct a format to avoid a warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Last -tip gives this warning: kernel/softirq.c: Dans la fonction «__do_softirq» : kernel/softirq.c:216: attention : format «%ld» expects type «long int», but argument 2 has type «int» This patch corrects the format type, and a small mistake in the "softirq" word. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 1cf1e2f..be7a829 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -210,7 +210,7 @@ restart: h->action(h); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered sotfirq %ld %p" + printk(KERN_ERR "huh, entered softirq %d %p" "with preempt_count %08x," " exited with %08x?\n", h - softirq_vec, h->action, prev_count, preempt_count()); -- cgit v1.1 From d294eb83d8d39a29f01dad391f15fc3a29aa04f9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 3 Oct 2008 12:10:10 +0200 Subject: cpusets: scan_for_empty_cpusets(), cpuset doesn't seem to be so const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a warning on latest -tip: kernel/cpuset.c: Dans la fonction «scan_for_empty_cpusets» : kernel/cpuset.c:1932: attention : passing argument 1 of «list_add_tail» discards qualifiers from pointer target type Actually the struct cpuset *root passed in parameter to scan_for_empty_cpusets is not supposed to be const since an entry is added on the tail of its list. Just correct the qualifier. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 827cd9ad..eab7bd6 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * that has tasks along with an empty 'mems'. But if we did see such * a cpuset, we'd handle it just like we do if its 'cpus' was empty. */ -static void scan_for_empty_cpusets(const struct cpuset *root) +static void scan_for_empty_cpusets(struct cpuset *root) { LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ -- cgit v1.1 From 07454bfff151d2465ada809bbaddf3548cc1097c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Oct 2008 10:51:07 +0200 Subject: clockevents: check broadcast tick device not the clock events device Impact: jiffies increment too fast. Hugh Dickins noted that with NOHZ=n and HIGHRES=n jiffies get incremented too fast. The reason is a wrong check in the broadcast enter/exit code, which keeps the local apic timer in periodic mode when the switch happens. Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index bd70345..cb01cd8 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -235,7 +235,8 @@ static void tick_do_broadcast_on_off(void *why) case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); - if (bc->mode == TICKDEV_MODE_PERIODIC) + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) @@ -245,7 +246,8 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_broadcast_force && cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); - if (bc->mode == TICKDEV_MODE_PERIODIC) + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); } break; -- cgit v1.1 From f6121f4f8708195e88cbdf8dd8d171b226b3f858 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Fri, 3 Oct 2008 17:40:46 +0200 Subject: sched_rt.c: resch needed in rt_rq_enqueue() for the root rt_rq While working on the new version of the code for SCHED_SPORADIC I noticed something strange in the present throttling mechanism. More specifically in the throttling timer handler in sched_rt.c (do_sched_rt_period_timer()) and in rt_rq_enqueue(). The problem is that, when unthrottling a runqueue, rt_rq_enqueue() only asks for rescheduling if the runqueue has a sched_entity associated to it (i.e., rt_rq->rt_se != NULL). Now, if the runqueue is the root rq (which has a rt_se = NULL) rescheduling does not take place, and it is delayed to some undefined instant in the future. This imply some random bandwidth usage by the RT tasks under throttling. For instance, setting rt_runtime_us/rt_period_us = 950ms/1000ms an RT task will get less than 95%. In our tests we got something varying between 70% to 95%. Using smaller time values, e.g., 95ms/100ms, things are even worse, and I can see values also going down to 20-25%!! The tests we performed are simply running 'yes' as a SCHED_FIFO task, and checking the CPU usage with top, but we can investigate thoroughly if you think it is needed. Things go much better, for us, with the attached patch... Don't know if it is the best approach, but it solved the issue for us. Signed-off-by: Dario Faggioli Signed-off-by: Michael Trimarchi Acked-by: Peter Zijlstra Cc: Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d570a8c..cdf5740 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { + struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se = rt_rq->rt_se; - if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; - - enqueue_rt_entity(rt_se); + if (rt_rq->rt_nr_running) { + if (rt_se && !on_rt_rq(rt_se)) + enqueue_rt_entity(rt_se); if (rt_rq->highest_prio < curr->prio) resched_task(curr); } -- cgit v1.1 From 34b3ede2353604ec9861c1d900b2a835ff85de47 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 6 Oct 2008 09:27:00 +0800 Subject: sched: remove redundant code in cpu_cgroup_create() css will be initialized by cgroup core. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2caedc4..9715f4c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9088,7 +9088,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!cgrp->parent) { /* This is early initialization for the top cgroup */ - init_task_group.css.cgroup = cgrp; return &init_task_group.css; } @@ -9097,9 +9096,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - /* Bind the cgroup to task_group object we just created */ - tg->css.cgroup = cgrp; - return &tg->css; } -- cgit v1.1 From cc1e0f4f7ad95a9eb81e1904cb16068af226180d Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 6 Oct 2008 13:50:59 -0500 Subject: kgdb: call touch_softlockup_watchdog on resume The softlockup watchdog needs to be touched when resuming the from the kgdb stopped state to avoid the printk that a CPU is stuck if the debugger was active for longer than the softlockup threshold. Signed-off-by: Jason Wessel --- kernel/kgdb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 25d955d..e4dcfb2 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1432,6 +1433,7 @@ acquirelock: atomic_read(&kgdb_cpu_doing_single_step) != cpu) { atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1524,6 +1526,7 @@ acquirelock: kgdb_restore: /* Free kgdb_active */ atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); -- cgit v1.1 From 2fb7635c4cea310992a39580133099dd99ad151c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Oct 2008 09:16:04 +0200 Subject: sched: sync wakeups vs avg_overlap While looking at the code I wondered why we always do: sync && avg_overlap < migration_cost Which is a bit odd, since the overlap test was meant to detect sync wakeups so using it to specialize sync wakeups doesn't make much sense. Hence change the code to do: sync || avg_overlap < migration_cost Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fcbe850a..18fd171 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1103,6 +1103,11 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; + if (!sync && sched_feat(SYNC_WAKEUPS) && + curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + sync = 1; + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1127,11 +1132,8 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, * a reasonable amount of time then attract this newly * woken task: */ - if (sync && balanced) { - if (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - return 1; - } + if (sync && balanced) + return 1; schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); @@ -1268,9 +1270,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (!sched_feat(WAKEUP_PREEMPT)) return; - if (sched_feat(WAKEUP_OVERLAP) && sync && - se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost) { + if (sched_feat(WAKEUP_OVERLAP) && (sync || + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { resched_task(curr); return; } -- cgit v1.1 From a5d8c3483a6e19aca95ef6a2c5890e33bfa5b293 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Oct 2008 11:35:51 +0200 Subject: sched debug: add name to sched_domain sysctl entries add /proc/sys/kernel/sched_domain/cpu0/domain0/name, to make it easier to see which specific scheduler domain remained at that entry. Since we process the scheduler domain tree and simplify it, it's not always immediately clear during debugging which domain came from where. depends on CONFIG_SCHED_DEBUG=y. Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9715f4c..6f23059 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6351,7 +6351,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(12); + struct ctl_table *table = sd_alloc_ctl_entry(13); if (table == NULL) return NULL; @@ -6379,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); - /* &table[11] is terminator */ + set_table_entry(&table[11], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[12] is terminator */ return table; } @@ -7263,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type) sd->name = #type +#else +# define SD_INIT_NAME(sd, type) do { } while (0) +#endif + #define SD_INIT(sd, type) sd_init_##type(sd) + #define SD_INIT_FUNC(type) \ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ } SD_INIT_FUNC(CPU) -- cgit v1.1 From 8083e4ad970e4eb567e31037060cdd4ba346f0c0 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Mon, 4 Aug 2008 11:59:11 -0700 Subject: [CPUFREQ][5/6] cpufreq: Changes to get_cpu_idle_time_us(), used by ondemand governor export get_cpu_idle_time_us() for it to be used in ondemand governor. Last update time can be current time when the CPU is currently non-idle, accounting for the busy time since last idle. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Dave Jones --- kernel/time/tick-sched.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cb02324..a4d2193 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -190,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - *last_update_time = ktime_to_us(ts->idle_lastupdate); + if (!tick_nohz_enabled) + return -1; + + if (ts->idle_active) + *last_update_time = ktime_to_us(ts->idle_lastupdate); + else + *last_update_time = ktime_to_us(ktime_get()); + return ktime_to_us(ts->idle_sleeptime); } +EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * tick_nohz_stop_sched_tick - stop the idle tick from the idle task -- cgit v1.1 From a6bebbc87a8c16eabb6bd5c6fd2d994be0236fba Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 5 Oct 2008 00:51:15 +0400 Subject: [PATCH] signal, procfs: some lock_task_sighand() users do not need rcu_read_lock() lock_task_sighand() make sure task->sighand is being protected, so we do not need rcu_read_lock(). [ exec() will get task->sighand->siglock before change task->sighand! ] But code using rcu_read_lock() _just_ to protect lock_task_sighand() only appear in procfs. (and some code in procfs use lock_task_sighand() without such redundant protection.) Other subsystem may put lock_task_sighand() into rcu_read_lock() critical region, but these rcu_read_lock() are used for protecting "for_each_process()", "find_task_by_vpid()" etc. , not for protecting lock_task_sighand(). Signed-off-by: Lai Jiangshan [ok from Oleg] Signed-off-by: Alexey Dobriyan --- kernel/sched_debug.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index bbe6b31..ad958c1 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -333,12 +333,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) unsigned long flags; int num_threads = 1; - rcu_read_lock(); if (lock_task_sighand(p, &flags)) { num_threads = atomic_read(&p->signal->count); unlock_task_sighand(p, &flags); } - rcu_read_unlock(); SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); SEQ_printf(m, -- cgit v1.1 From 3bbfe0596746e1590888a6e1e6a07583265238b7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 10 Oct 2008 03:27:16 +0400 Subject: proc: remove kernel.maps_protect After commit 831830b5a2b5d413407adf380ef62fe17d6fcbf2 aka "restrict reading from /proc//maps to those who share ->mm or can ptrace" sysctl stopped being relevant because commit moved security checks from ->show time to ->start time (mm_for_maps()). Signed-off-by: Alexey Dobriyan Acked-by: Kees Cook --- kernel/sysctl.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 50ec088..cc3e0d7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -80,7 +80,6 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; -extern int maps_protect; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifdef CONFIG_RCU_TORTURE_TEST @@ -810,16 +809,6 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_PROC_FS - { - .ctl_name = CTL_UNNUMBERED, - .procname = "maps_protect", - .data = &maps_protect, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif { .ctl_name = CTL_UNNUMBERED, .procname = "poweroff_cmd", -- cgit v1.1 From 5b7dba4ff834259a5623e03a565748704a8fe449 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Thu, 9 Oct 2008 13:21:30 -0500 Subject: sched_clock: prevent scd->clock from moving backwards When sched_clock_cpu() couples the clocks between two cpus, it may increment scd->clock beyond the GTOD tick window that __update_sched_clock() uses to clamp the clock. A later call to __update_sched_clock() may move the clock back to scd->tick_gtod + TICK_NSEC, violating the clock's monotonic property. This patch ensures that scd->clock will not be set backward. Signed-off-by: Dave Kleikamp Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e8ab096..8178724 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) /* * scd->clock = clamp(scd->tick_gtod + delta, - * max(scd->tick_gtod, scd->clock), - * scd->tick_gtod + TICK_NSEC); + * max(scd->tick_gtod, scd->clock), + * max(scd->clock, scd->tick_gtod + TICK_NSEC)); */ clock = scd->tick_gtod + delta; min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = scd->tick_gtod + TICK_NSEC; + max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); -- cgit v1.1 From 061b1bd394ca8628b7c24eb4658ba3535da4249a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 24 Sep 2008 14:46:44 -0700 Subject: Staging: add TAINT_CRAP for all drivers/staging code We need to add a flag for all code that is in the drivers/staging/ directory to prevent all other kernel developers from worrying about issues here, and to notify users that the drivers might not be as good as they are normally used to. Based on code from Andreas Gruenbacher and Jeff Mahoney to provide a TAINT flag for the support level of a kernel module in the Novell enterprise kernel release. This is the kernel portion of this feature, the ability for the flag to be set needs to be done in the build process and will happen in a follow-up patch. Cc: Andreas Gruenbacher Cc: Jeff Mahoney Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 11 +++++++++++ kernel/panic.c | 6 ++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9db1191..152b165 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1806,6 +1806,7 @@ static noinline struct module *load_module(void __user *umod, Elf_Ehdr *hdr; Elf_Shdr *sechdrs; char *secstrings, *args, *modmagic, *strtab = NULL; + char *staging; unsigned int i; unsigned int symindex = 0; unsigned int strindex = 0; @@ -1960,6 +1961,14 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } + staging = get_modinfo(sechdrs, infoindex, "staging"); + if (staging) { + add_taint_module(mod, TAINT_CRAP); + printk(KERN_WARNING "%s: module is from the staging directory," + " the quality is unknown, you have been warned.\n", + mod->name); + } + /* Now copy in args */ args = strndup_user(uargs, ~0UL >> 1); if (IS_ERR(args)) { @@ -2556,6 +2565,8 @@ static char *module_flags(struct module *mod, char *buf) buf[bx++] = 'P'; if (mod->taints & TAINT_FORCED_MODULE) buf[bx++] = 'F'; + if (mod->taints & TAINT_CRAP) + buf[bx++] = 'C'; /* * TAINT_FORCED_RMMOD: could be added. * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't diff --git a/kernel/panic.c b/kernel/panic.c index 12c5a0a..98e2047 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -155,6 +155,7 @@ EXPORT_SYMBOL(panic); * 'U' - Userspace-defined naughtiness. * 'A' - ACPI table overridden. * 'W' - Taint on warning. + * 'C' - modules from drivers/staging are loaded. * * The string is overwritten by the next call to print_taint(). */ @@ -163,7 +164,7 @@ const char *print_tainted(void) { static char buf[20]; if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", + snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c%c", tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', tainted & TAINT_FORCED_MODULE ? 'F' : ' ', tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', @@ -173,7 +174,8 @@ const char *print_tainted(void) tainted & TAINT_USER ? 'U' : ' ', tainted & TAINT_DIE ? 'D' : ' ', tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', - tainted & TAINT_WARN ? 'W' : ' '); + tainted & TAINT_WARN ? 'W' : ' ', + tainted & TAINT_CRAP ? 'C' : ' '); } else snprintf(buf, sizeof(buf), "Not tainted"); -- cgit v1.1 From 9c9f4ded90a59eee84e15f5fd38c03d60184e112 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 13 Oct 2008 10:37:26 +0100 Subject: tty: Add a kref count Introduce a kref to the tty structure and use it to protect the tty->signal tty references. For now we don't introduce it for anything else. Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 ++++- kernel/sys.c | 4 +--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7ce2ebe..30de644 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -802,6 +802,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = NULL; + sig->tty = NULL; sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; @@ -838,6 +839,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_signal(struct signal_struct *sig) { exit_thread_group_keys(sig); + tty_kref_put(sig->tty); kmem_cache_free(signal_cachep, sig); } @@ -1227,7 +1229,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; - p->signal->tty = current->signal->tty; + tty_kref_put(p->signal->tty); + p->signal->tty = tty_kref_get(current->signal->tty); set_task_pgrp(p, task_pgrp_nr(current)); set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); diff --git a/kernel/sys.c b/kernel/sys.c index 038a7bc..234d945 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1060,9 +1060,7 @@ asmlinkage long sys_setsid(void) group_leader->signal->leader = 1; __set_special_pids(sid); - spin_lock(&group_leader->sighand->siglock); - group_leader->signal->tty = NULL; - spin_unlock(&group_leader->sighand->siglock); + proc_clear_tty(group_leader); err = session; out: -- cgit v1.1 From 95f9bfc6b76e862265a2d70ae061eec18fe14140 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 13 Oct 2008 10:39:23 +0100 Subject: tty: Move tty_write_message out of kernel/printk This is pure tty code so put it in the tty layer where it can be with the locking relevant material it uses Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- kernel/printk.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index b51b156..a430fd0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1291,22 +1291,6 @@ static int __init disable_boot_consoles(void) } late_initcall(disable_boot_consoles); -/** - * tty_write_message - write a message to a certain tty, not just the console. - * @tty: the destination tty_struct - * @msg: the message to write - * - * This is used for messages that need to be redirected to a specific tty. - * We don't put it into the syslog queue right now maybe in the future if - * really needed. - */ -void tty_write_message(struct tty_struct *tty, char *msg) -{ - if (tty && tty->ops->write) - tty->ops->write(tty, msg, strlen(msg)); - return; -} - #if defined CONFIG_PRINTK /* -- cgit v1.1 From dbda4c0b97b18fd59b3964548361b4f92357f730 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 13 Oct 2008 10:40:53 +0100 Subject: tty: Fix abusers of current->sighand->tty Various people outside the tty layer still stick their noses in behind the scenes. We need to make sure they also obey the locking and referencing rules. Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- kernel/auditsc.c | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index dd68b90..f6006a6 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -548,7 +548,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, #endif spin_lock_irq(¤t->sighand->siglock); - tty = current->signal->tty; + tty = current->signal->tty; /* Safe as we hold the siglock */ ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 59cedfb..cf5bc2f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -246,8 +246,8 @@ static int audit_match_perm(struct audit_context *ctx, int mask) unsigned n; if (unlikely(!ctx)) return 0; - n = ctx->major; + switch (audit_classify_syscall(ctx->arch, n)) { case 0: /* native */ if ((mask & AUDIT_PERM_WRITE) && @@ -1204,13 +1204,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); - mutex_lock(&tty_mutex); - read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) tty = tsk->signal->tty->name; else tty = "(none)"; - read_unlock(&tasklist_lock); + spin_unlock_irq(&tsk->sighand->siglock); + audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " ppid=%d pid=%d auid=%u uid=%u gid=%u" @@ -1230,7 +1230,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->egid, context->sgid, context->fsgid, tty, tsk->sessionid); - mutex_unlock(&tty_mutex); audit_log_task_info(ab, tsk); if (context->filterkey) { -- cgit v1.1 From 118a9069f06ff591d51a3133e242f0c256ba2db7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 17 Oct 2008 02:38:36 -0500 Subject: module: remove CONFIG_KMOD in comment after #endif Signed-off-by: Rusty Russell --- kernel/kmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 2456d1a..58f9c2e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -113,7 +113,7 @@ int request_module(const char *fmt, ...) return ret; } EXPORT_SYMBOL(request_module); -#endif /* CONFIG_KMOD */ +#endif /* CONFIG_MODULES */ struct subprocess_info { struct work_struct work; -- cgit v1.1 From e94320939f44e0cbaccc3f259a5778abced4949c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 23 Sep 2008 23:51:11 +0400 Subject: modules: fix module "notes" kobject leak Fix "notes" kobject leak It happens every rmmod if KALLSYMS=y and SYSFS=y. # modprobe foo kobject: 'foo' (ffffffffa00743d0): kobject_add_internal: parent: 'module', set: 'module' kobject: 'holders' (ffff88017e7c5770): kobject_add_internal: parent: 'foo', set: '' kobject: 'foo' (ffffffffa00743d0): kobject_uevent_env kobject: 'foo' (ffffffffa00743d0): fill_kobj_path: path = '/module/foo' kobject: 'notes' (ffff88017fa9b668): kobject_add_internal: parent: 'foo', set: '' ^^^^^ # rmmod foo kobject: 'holders' (ffff88017e7c5770): kobject_cleanup kobject: 'holders' (ffff88017e7c5770): auto cleanup kobject_del kobject: 'holders' (ffff88017e7c5770): calling ktype release kobject: (ffff88017e7c5770): dynamic_kobj_release kobject: 'holders': free name kobject: 'foo' (ffffffffa00743d0): kobject_cleanup kobject: 'foo' (ffffffffa00743d0): does not have a release() function, it is broken and must be fixed. kobject: 'foo' (ffffffffa00743d0): auto cleanup 'remove' event kobject: 'foo' (ffffffffa00743d0): kobject_uevent_env kobject: 'foo' (ffffffffa00743d0): fill_kobj_path: path = '/module/foo' kobject: 'foo' (ffffffffa00743d0): auto cleanup kobject_del kobject: 'foo': free name [whooops] Signed-off-by: Alexey Dobriyan Cc: stable Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9db1191..d5fcd24 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1173,7 +1173,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, while (i-- > 0) sysfs_remove_bin_file(notes_attrs->dir, ¬es_attrs->attrs[i]); - kobject_del(notes_attrs->dir); + kobject_put(notes_attrs->dir); } kfree(notes_attrs); } -- cgit v1.1 From 346e15beb5343c2eb8216d820f2ed8f150822b08 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 12 Aug 2008 16:46:19 -0400 Subject: driver core: basic infrastructure for per-module dynamic debug messages Base infrastructure to enable per-module debug messages. I've introduced CONFIG_DYNAMIC_PRINTK_DEBUG, which when enabled centralizes control of debugging statements on a per-module basis in one /proc file, currently, /dynamic_printk/modules. When, CONFIG_DYNAMIC_PRINTK_DEBUG, is not set, debugging statements can still be enabled as before, often by defining 'DEBUG' for the proper compilation unit. Thus, this patch set has no affect when CONFIG_DYNAMIC_PRINTK_DEBUG is not set. The infrastructure currently ties into all pr_debug() and dev_dbg() calls. That is, if CONFIG_DYNAMIC_PRINTK_DEBUG is set, all pr_debug() and dev_dbg() calls can be dynamically enabled/disabled on a per-module basis. Future plans include extending this functionality to subsystems, that define their own debug levels and flags. Usage: Dynamic debugging is controlled by the debugfs file, /dynamic_printk/modules. This file contains a list of the modules that can be enabled. The format of the file is as follows: . . . : Name of the module in which the debug call resides : whether the messages are enabled or not For example: snd_hda_intel enabled=0 fixup enabled=1 driver enabled=0 Enable a module: $echo "set enabled=1 " > dynamic_printk/modules Disable a module: $echo "set enabled=0 " > dynamic_printk/modules Enable all modules: $echo "set enabled=1 all" > dynamic_printk/modules Disable all modules: $echo "set enabled=0 all" > dynamic_printk/modules Finally, passing "dynamic_printk" at the command line enables debugging for all modules. This mode can be turned off via the above disable command. [gkh: minor cleanups and tweaks to make the build work quietly] Signed-off-by: Jason Baron Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d5fcd24..c527006 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -784,6 +784,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); + unregister_dynamic_debug_module(mod->name); free_module(mod); out: @@ -1783,6 +1784,33 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ +#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG +static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex) +{ + struct mod_debug *debug_info; + unsigned long pos, end; + unsigned int num_verbose; + + pos = sechdrs[verboseindex].sh_addr; + num_verbose = sechdrs[verboseindex].sh_size / + sizeof(struct mod_debug); + end = pos + (num_verbose * sizeof(struct mod_debug)); + + for (; pos < end; pos += sizeof(struct mod_debug)) { + debug_info = (struct mod_debug *)pos; + register_dynamic_debug_module(debug_info->modname, + debug_info->type, debug_info->logical_modname, + debug_info->flag_names, debug_info->hash, + debug_info->hash2); + } +} +#else +static inline void dynamic_printk_setup(Elf_Shdr *sechdrs, + unsigned int verboseindex) +{ +} +#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */ + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); @@ -1831,6 +1859,7 @@ static noinline struct module *load_module(void __user *umod, #endif unsigned int markersindex; unsigned int markersstringsindex; + unsigned int verboseindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -2117,6 +2146,7 @@ static noinline struct module *load_module(void __user *umod, markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); markersstringsindex = find_sec(hdr, sechdrs, secstrings, "__markers_strings"); + verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -2167,6 +2197,7 @@ static noinline struct module *load_module(void __user *umod, marker_update_probe_range(mod->markers, mod->markers + mod->num_markers); #endif + dynamic_printk_setup(sechdrs, verboseindex); err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; -- cgit v1.1 From 9363b9f23c9cc36cc8ef6c05fdf879ee4a96ae92 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 15 Oct 2008 22:01:05 -0700 Subject: memrlimit: cgroup mm owner callback changes to add task info This patch adds an additional field to the mm_owner callbacks. This field is required to get to the mm that changed. Hold mmap_sem in write mode before calling the mm_owner_changed callback [hugh@veritas.com: fix mmap_sem deadlock] Signed-off-by: Balbir Singh Cc: Sudhir Kumar Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: Li Zefan Cc: Pavel Emelianov Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Vivek Goyal Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 +++- kernel/exit.c | 9 ++++----- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a0123d7..8c6e1c1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2735,6 +2735,8 @@ void cgroup_fork_callbacks(struct task_struct *child) * Called on every change to mm->owner. mm_init_owner() does not * invoke this routine, since it assigns the mm->owner the first time * and does not change it. + * + * The callbacks are invoked with mmap_sem held in read mode. */ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) { @@ -2750,7 +2752,7 @@ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) if (oldcgrp == newcgrp) continue; if (ss->mm_owner_changed) - ss->mm_owner_changed(ss, oldcgrp, newcgrp); + ss->mm_owner_changed(ss, oldcgrp, newcgrp, new); } } } diff --git a/kernel/exit.c b/kernel/exit.c index 85a83c8..0ef4673 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -640,24 +640,23 @@ retry: assign_new_owner: BUG_ON(c == p); get_task_struct(c); + read_unlock(&tasklist_lock); + down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); - /* - * Delay read_unlock() till we have the task_lock() - * to ensure that c does not slip away underneath us - */ - read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); + up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); + up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ -- cgit v1.1 From 1bfcf1304ea79c46efc3724e548b13b4b442b418 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 15 Oct 2008 22:01:21 -0700 Subject: pm: rework disabling of user mode helpers during suspend/hibernation We currently use a PM notifier to disable user mode helpers before suspend and hibernation and to re-enable them during resume. However, this is not an ideal solution, because if any drivers want to upload firmware into memory before suspend, they have to use a PM notifier for this purpose and there is no guarantee that the ordering of PM notifiers will be as expected (ie. the notifier that disables user mode helpers has to be run after the driver's notifier used for uploading the firmware). For this reason, it seems better to move the disabling and enabling of user mode helpers to separate functions that will be called by the PM core as necessary. [akpm@linux-foundation.org: remove unneeded ifdefs] Signed-off-by: Rafael J. Wysocki Cc: Alan Stern Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 65 +++++++++++++++++++++++------------------------------ kernel/power/disk.c | 11 +++++++++ kernel/power/main.c | 7 ++++++ kernel/power/user.c | 10 ++++++++- 4 files changed, 55 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 2456d1a..ab7dd08 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -265,7 +265,7 @@ static void __call_usermodehelper(struct work_struct *work) } } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP /* * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user @@ -288,39 +288,37 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) -static int usermodehelper_pm_callback(struct notifier_block *nfb, - unsigned long action, - void *ignored) +/** + * usermodehelper_disable - prevent new helpers from being started + */ +int usermodehelper_disable(void) { long retval; - switch (action) { - case PM_HIBERNATION_PREPARE: - case PM_SUSPEND_PREPARE: - usermodehelper_disabled = 1; - smp_mb(); - /* - * From now on call_usermodehelper_exec() won't start any new - * helpers, so it is sufficient if running_helpers turns out to - * be zero at one point (it may be increased later, but that - * doesn't matter). - */ - retval = wait_event_timeout(running_helpers_waitq, + usermodehelper_disabled = 1; + smp_mb(); + /* + * From now on call_usermodehelper_exec() won't start any new + * helpers, so it is sufficient if running_helpers turns out to + * be zero at one point (it may be increased later, but that + * doesn't matter). + */ + retval = wait_event_timeout(running_helpers_waitq, atomic_read(&running_helpers) == 0, RUNNING_HELPERS_TIMEOUT); - if (retval) { - return NOTIFY_OK; - } else { - usermodehelper_disabled = 0; - return NOTIFY_BAD; - } - case PM_POST_HIBERNATION: - case PM_POST_SUSPEND: - usermodehelper_disabled = 0; - return NOTIFY_OK; - } + if (retval) + return 0; - return NOTIFY_DONE; + usermodehelper_disabled = 0; + return -EAGAIN; +} + +/** + * usermodehelper_enable - allow new helpers to be started again + */ +void usermodehelper_enable(void) +{ + usermodehelper_disabled = 0; } static void helper_lock(void) @@ -334,18 +332,12 @@ static void helper_unlock(void) if (atomic_dec_and_test(&running_helpers)) wake_up(&running_helpers_waitq); } - -static void register_pm_notifier_callback(void) -{ - pm_notifier(usermodehelper_pm_callback, 0); -} -#else /* CONFIG_PM */ +#else /* CONFIG_PM_SLEEP */ #define usermodehelper_disabled 0 static inline void helper_lock(void) {} static inline void helper_unlock(void) {} -static inline void register_pm_notifier_callback(void) {} -#endif /* CONFIG_PM */ +#endif /* CONFIG_PM_SLEEP */ /** * call_usermodehelper_setup - prepare to call a usermode helper @@ -515,5 +507,4 @@ void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); BUG_ON(!khelper_wq); - register_pm_notifier_callback(); } diff --git a/kernel/power/disk.c b/kernel/power/disk.c index bbd85c6..331f983 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -520,6 +521,10 @@ int hibernate(void) if (error) goto Exit; + error = usermodehelper_disable(); + if (error) + goto Exit; + /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) @@ -558,6 +563,7 @@ int hibernate(void) thaw_processes(); Finish: free_basic_memory_bitmaps(); + usermodehelper_enable(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); pm_restore_console(); @@ -634,6 +640,10 @@ static int software_resume(void) if (error) goto Finish; + error = usermodehelper_disable(); + if (error) + goto Finish; + error = create_basic_memory_bitmaps(); if (error) goto Finish; @@ -656,6 +666,7 @@ static int software_resume(void) thaw_processes(); Done: free_basic_memory_bitmaps(); + usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); pm_restore_console(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 540b16b..19122cf 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -237,6 +238,10 @@ static int suspend_prepare(void) if (error) goto Finish; + error = usermodehelper_disable(); + if (error) + goto Finish; + if (suspend_freeze_processes()) { error = -EAGAIN; goto Thaw; @@ -256,6 +261,7 @@ static int suspend_prepare(void) Thaw: suspend_thaw_processes(); + usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); @@ -376,6 +382,7 @@ int suspend_devices_and_enter(suspend_state_t state) static void suspend_finish(void) { suspend_thaw_processes(); + usermodehelper_enable(); pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); } diff --git a/kernel/power/user.c b/kernel/power/user.c index a6332a3..005b93d 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -212,13 +212,20 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_FREEZE: if (data->frozen) break; + printk("Syncing filesystems ... "); sys_sync(); printk("done.\n"); - error = freeze_processes(); + error = usermodehelper_disable(); if (error) + break; + + error = freeze_processes(); + if (error) { thaw_processes(); + usermodehelper_enable(); + } if (!error) data->frozen = 1; break; @@ -227,6 +234,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!data->frozen || data->ready) break; thaw_processes(); + usermodehelper_enable(); data->frozen = 0; break; -- cgit v1.1 From d9f3216b474be170d0c093d70125b541ace58704 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 15 Oct 2008 22:01:26 -0700 Subject: kernel/dma.c: remove a CVS keyword Remove a CVS keyword that wasn't updated for a long time from a comment. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma.c b/kernel/dma.c index d2c60a8..f903189 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -1,4 +1,4 @@ -/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $ +/* * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. * * Written by Hennus Bergman, 1992. -- cgit v1.1 From a25d644fc0e232f242d1f3baa63c149c42536ff0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Oct 2008 22:01:38 -0700 Subject: wait: kill is_sync_wait() is_sync_wait() is used to distinguish between sync and async waits. Basically sync waits are the ones initialized with init_waitqueue_entry() and async ones with init_waitqueue_func_entry(). The sync/async distinction is used only in prepare_to_wait[_exclusive]() and its only function is to skip setting the current task state if the wait is async. This has a few problems. * No one uses it. None of func_entry users use prepare_to_wait() functions, so the code path never gets executed. * The distinction is bogus. Maybe back when func_entry is used only by aio but it's now also used by epoll and in future possibly by 9p and poll/select. * Taking @state as argument and ignoring it silenly depending on how @wait is initialized is just a bad error-prone API. * It prevents func_entry waits from using wait->private for no good reason. This patch kills is_sync_wait() and the associated code paths from prepare_to_wait[_exclusive](). As there was no user of these code paths, this patch doesn't cause any behavior difference. Signed-off-by: Tejun Heo Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/wait.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/wait.c b/kernel/wait.c index c275c56..cd87131 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -72,12 +72,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) __add_wait_queue(q, wait); - /* - * don't alter the task state if this is just going to - * queue an async wait queue callback - */ - if (is_sync_wait(wait)) - set_current_state(state); + set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait); @@ -91,12 +86,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) __add_wait_queue_tail(q, wait); - /* - * don't alter the task state if this is just going to - * queue an async wait queue callback - */ - if (is_sync_wait(wait)) - set_current_state(state); + set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); -- cgit v1.1 From 9ba16087d9f996a93ab6f4453a52a4b24bc1f25c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 15 Oct 2008 22:01:38 -0700 Subject: Kconfig: eliminate "def_bool n" constructs Using "def_bool n" is pointless, simply using bool here appears more appropriate. Further, retaining such options that don't have a prompt and aren't selected by anything seems also at least questionable. Signed-off-by: Jan Beulich Cc: Ingo Molnar Cc: Tony Luck Cc: Thomas Gleixner Cc: Bartlomiej Zolnierkiewicz Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8d53106..95ed429 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -3,7 +3,6 @@ # config TICK_ONESHOT bool - default n config NO_HZ bool "Tickless System (Dynamic Ticks)" -- cgit v1.1 From 25ddbb18aae33ad255eb9f35aacebe3af01e1e9c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 15 Oct 2008 22:01:41 -0700 Subject: Make the taint flags reliable It's somewhat unlikely that it happens, but right now a race window between interrupts or machine checks or oopses could corrupt the tainted bitmap because it is modified in a non atomic fashion. Convert the taint variable to an unsigned long and use only atomic bit operations on it. Unfortunately this means the intvec sysctl functions cannot be used on it anymore. It turned out the taint sysctl handler could actually be simplified a bit (since it only increases capabilities) so this patch actually removes code. [akpm@linux-foundation.org: remove unneeded include] Signed-off-by: Andi Kleen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 12 +++++----- kernel/panic.c | 63 +++++++++++++++++++++++++++++++++++-------------- kernel/softlockup.c | 2 +- kernel/sysctl.c | 67 ++++++++++++++++++++++------------------------------- 4 files changed, 81 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9db1191..dd9ac6a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -100,7 +100,7 @@ static inline int strong_try_module_get(struct module *mod) static inline void add_taint_module(struct module *mod, unsigned flag) { add_taint(flag); - mod->taints |= flag; + mod->taints |= (1U << flag); } /* @@ -923,7 +923,7 @@ static const char vermagic[] = VERMAGIC_STRING; static int try_to_force_load(struct module *mod, const char *symname) { #ifdef CONFIG_MODULE_FORCE_LOAD - if (!(tainted & TAINT_FORCED_MODULE)) + if (!test_taint(TAINT_FORCED_MODULE)) printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); add_taint_module(mod, TAINT_FORCED_MODULE); @@ -1033,7 +1033,7 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, const unsigned long *crc; ret = find_symbol(name, &owner, &crc, - !(mod->taints & TAINT_PROPRIETARY_MODULE), true); + !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); if (!IS_ERR_VALUE(ret)) { /* use_module can fail due to OOM, or module initialization or unloading */ @@ -1634,7 +1634,7 @@ static void set_license(struct module *mod, const char *license) license = "unspecified"; if (!license_is_gpl_compatible(license)) { - if (!(tainted & TAINT_PROPRIETARY_MODULE)) + if (!test_taint(TAINT_PROPRIETARY_MODULE)) printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); add_taint_module(mod, TAINT_PROPRIETARY_MODULE); @@ -2552,9 +2552,9 @@ static char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - if (mod->taints & TAINT_PROPRIETARY_MODULE) + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) buf[bx++] = 'P'; - if (mod->taints & TAINT_FORCED_MODULE) + if (mod->taints & (1 << TAINT_FORCED_MODULE)) buf[bx++] = 'F'; /* * TAINT_FORCED_RMMOD: could be added. diff --git a/kernel/panic.c b/kernel/panic.c index 12c5a0a..028013f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,7 +23,7 @@ #include int panic_on_oops; -int tainted; +static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -159,31 +159,60 @@ EXPORT_SYMBOL(panic); * The string is overwritten by the next call to print_taint(). */ +struct tnt { + u8 bit; + char true; + char false; +}; + +static const struct tnt tnts[] = { + { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, + { TAINT_FORCED_MODULE, 'F', ' ' }, + { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_FORCED_RMMOD, 'R', ' ' }, + { TAINT_MACHINE_CHECK, 'M', ' ' }, + { TAINT_BAD_PAGE, 'B', ' ' }, + { TAINT_USER, 'U', ' ' }, + { TAINT_DIE, 'D', ' ' }, + { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, + { TAINT_WARN, 'W', ' ' }, +}; + const char *print_tainted(void) { - static char buf[20]; - if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", - tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', - tainted & TAINT_FORCED_MODULE ? 'F' : ' ', - tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', - tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', - tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', - tainted & TAINT_BAD_PAGE ? 'B' : ' ', - tainted & TAINT_USER ? 'U' : ' ', - tainted & TAINT_DIE ? 'D' : ' ', - tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', - tainted & TAINT_WARN ? 'W' : ' '); - } - else + static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; + + if (tainted_mask) { + char *s; + int i; + + s = buf + sprintf(buf, "Tainted: "); + for (i = 0; i < ARRAY_SIZE(tnts); i++) { + const struct tnt *t = &tnts[i]; + *s++ = test_bit(t->bit, &tainted_mask) ? + t->true : t->false; + } + *s = 0; + } else snprintf(buf, sizeof(buf), "Not tainted"); return(buf); } +int test_taint(unsigned flag) +{ + return test_bit(flag, &tainted_mask); +} +EXPORT_SYMBOL(test_taint); + +unsigned long get_taint(void) +{ + return tainted_mask; +} + void add_taint(unsigned flag) { debug_locks = 0; /* can't trust the integrity of the kernel anymore */ - tainted |= flag; + set_bit(flag, &tainted_mask); } EXPORT_SYMBOL(add_taint); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index cb838ee..3953e4a 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -226,7 +226,7 @@ static void check_hung_uninterruptible_tasks(int this_cpu) * If the system crashed already then all bets are off, * do not report extra hung tasks: */ - if ((tainted & TAINT_DIE) || did_panic) + if (test_taint(TAINT_DIE) || did_panic) return; read_lock(&tasklist_lock); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cfc5295..ec88fcc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -149,7 +149,7 @@ extern int max_lock_depth; #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -379,10 +379,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", - .data = &tainted, - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = &proc_dointvec_taint, + .proc_handler = &proc_taint, }, #endif #ifdef CONFIG_LATENCYTOP @@ -2228,49 +2227,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, NULL,NULL); } -#define OP_SET 0 -#define OP_AND 1 -#define OP_OR 2 - -static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - int op = *(int *)data; - if (write) { - int val = *negp ? -*lvalp : *lvalp; - switch(op) { - case OP_SET: *valp = val; break; - case OP_AND: *valp &= val; break; - case OP_OR: *valp |= val; break; - } - } else { - int val = *valp; - if (val < 0) { - *negp = -1; - *lvalp = (unsigned long)-val; - } else { - *negp = 0; - *lvalp = (unsigned long)val; - } - } - return 0; -} - /* - * Taint values can only be increased + * Taint values can only be increased + * This means we can safely use a temporary. */ -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int op; + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; - op = OP_OR; - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_bset_conv,&op); + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + /* + * Poor man's atomic or. Not worth adding a primitive + * to everyone's atomic.h for this + */ + int i; + for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { + if ((tmptaint >> i) & 1) + add_taint(i); + } + } + + return err; } struct do_proc_dointvec_minmax_conv_param { -- cgit v1.1 From 0c2d64fb6cae9aae480f6a46cfe79f8d7d48b59f Mon Sep 17 00:00:00 2001 From: Adam Tkac Date: Wed, 15 Oct 2008 22:01:45 -0700 Subject: rlimit: permit setting RLIMIT_NOFILE to RLIM_INFINITY When a process wants to set the limit of open files to RLIM_INFINITY it gets EPERM even if it has CAP_SYS_RESOURCE capability. For example, BIND does: ... #elif defined(NR_OPEN) && defined(__linux__) /* * Some Linux kernels don't accept RLIM_INFINIT; the maximum * possible value is the NR_OPEN defined in linux/fs.h. */ if (resource == isc_resource_openfiles && rlim_value == RLIM_INFINITY) { rl.rlim_cur = rl.rlim_max = NR_OPEN; unixresult = setrlimit(unixresource, &rl); if (unixresult == 0) return (ISC_R_SUCCESS); } #elif ... If we allow setting RLIMIT_NOFILE to RLIM_INFINITY we increase portability - you don't have to check if OS is linux and then use different schema for limits. The spec says "Specifying RLIM_INFINITY as any resource limit value on a successful call to setrlimit() shall inhibit enforcement of that resource limit." and we're presently not doing that. Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 234d945..d5b79f6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1450,14 +1450,22 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) return -EINVAL; if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; old_rlim = current->signal->rlim + resource; if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) - return -EPERM; + + if (resource == RLIMIT_NOFILE) { + if (new_rlim.rlim_max == RLIM_INFINITY) + new_rlim.rlim_max = sysctl_nr_open; + if (new_rlim.rlim_cur == RLIM_INFINITY) + new_rlim.rlim_cur = sysctl_nr_open; + if (new_rlim.rlim_max > sysctl_nr_open) + return -EPERM; + } + + if (new_rlim.rlim_cur > new_rlim.rlim_max) + return -EINVAL; retval = security_task_setrlimit(resource, &new_rlim); if (retval) -- cgit v1.1 From 22b8ce94708f7cdf0b04965c6f7443dfd374c35c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 15 Oct 2008 22:01:46 -0700 Subject: profiling: dynamically enable readprofile at runtime Way too often, I have a machine that exhibits some kind of crappy behavior. The CPU looks wedged in the kernel or it is spending way too much system time and I wonder what is responsible. I try to run readprofile. But, of course, Ubuntu doesn't enable it by default. Dang! The reason we boot-time enable it is that it takes a big bufffer that we generally can only bootmem alloc. But, does it hurt to at least try and runtime-alloc it? To use: echo 2 > /sys/kernel/profile Then run readprofile like normal. This should fix the compile issue with allmodconfig. I've compile-tested on a bunch more configs now including a few more architectures. Signed-off-by: Dave Hansen Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 35 +++++++++++++++++++++++++++++++++++ kernel/profile.c | 41 +++++++++++++++++++++++++++++++---------- 2 files changed, 66 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e53bc30..08dd8ed 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #define KERNEL_ATTR_RO(_name) \ @@ -53,6 +54,37 @@ static ssize_t uevent_helper_store(struct kobject *kobj, KERNEL_ATTR_RW(uevent_helper); #endif +#ifdef CONFIG_PROFILING +static ssize_t profiling_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", prof_on); +} +static ssize_t profiling_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret; + + if (prof_on) + return -EEXIST; + /* + * This eventually calls into get_option() which + * has a ton of callers and is not const. It is + * easiest to cast it away here. + */ + profile_setup((char *)buf); + ret = profile_init(); + if (ret) + return ret; + ret = create_proc_profile(); + if (ret) + return ret; + return count; +} +KERNEL_ATTR_RW(profiling); +#endif + #ifdef CONFIG_KEXEC static ssize_t kexec_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -109,6 +141,9 @@ static struct attribute * kernel_attrs[] = { &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif +#ifdef CONFIG_PROFILING + &profiling_attr.attr, +#endif #ifdef CONFIG_KEXEC &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, diff --git a/kernel/profile.c b/kernel/profile.c index cd26bed..a9e422d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include @@ -50,11 +52,11 @@ static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); #endif /* CONFIG_SMP */ -static int __init profile_setup(char *str) +int profile_setup(char *str) { - static char __initdata schedstr[] = "schedule"; - static char __initdata sleepstr[] = "sleep"; - static char __initdata kvmstr[] = "kvm"; + static char schedstr[] = "schedule"; + static char sleepstr[] = "sleep"; + static char kvmstr[] = "kvm"; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { @@ -100,14 +102,33 @@ static int __init profile_setup(char *str) __setup("profile=", profile_setup); -void __init profile_init(void) +int profile_init(void) { + int buffer_bytes; if (!prof_on) - return; + return 0; /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; - prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); + buffer_bytes = prof_len*sizeof(atomic_t); + if (!slab_is_available()) { + prof_buffer = alloc_bootmem(buffer_bytes); + return 0; + } + + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); + if (prof_buffer) + return 0; + + prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); + if (prof_buffer) + return 0; + + prof_buffer = vmalloc(buffer_bytes); + if (prof_buffer) + return 0; + + return -ENOMEM; } /* Profile event notifications */ @@ -527,7 +548,7 @@ static void __init profile_nop(void *unused) { } -static int __init create_hash_tables(void) +static int create_hash_tables(void) { int cpu; @@ -575,14 +596,14 @@ out_cleanup: #define create_hash_tables() ({ 0; }) #endif -static int __init create_proc_profile(void) +int create_proc_profile(void) { struct proc_dir_entry *entry; if (!prof_on) return 0; if (create_hash_tables()) - return -1; + return -ENOMEM; entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &proc_profile_operations); if (!entry) -- cgit v1.1 From 87988815073918134c0dae059cf247a4472d78ed Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Wed, 15 Oct 2008 22:01:51 -0700 Subject: utsname: completely overwrite prior information On sethostname() and setdomainname(), previous information may be retained if it was longer than than the new hostname/domainname. This can be demonstrated trivially by calling sethostname() first with a long name, then with a short name, and then calling uname() to retrieve the full buffer that contains the hostname (and possibly parts of the old hostname), one just has to look past the terminating zero. I don't know if we should really care that much (hence the RFC); the only scenarios I can possibly think of is administrator putting something sensitive in the hostname (or domain name) by accident, and changing it back will not undo the mistake entirely, though it's not like we can recover gracefully from "rm -rf /" either... The other scenario is namespaces (CLONE_NEWUTS) where some information may be unintentionally "inherited" from the previous namespace (a program wants to hide the original name and does clone + sethostname, but some information is still left). I think the patch may be defended on grounds of the principle of least surprise. But I am not adamant :-) (I guess the question now is whether userspace should be able to write embedded NULs into the buffer or not...) At least the observation has been made and the patch has been presented. Signed-off-by: Vegard Nossum Cc: "Eric W. Biederman" Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index d5b79f6..558b035 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1350,7 +1350,8 @@ asmlinkage long sys_sethostname(char __user *name, int len) errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { memcpy(utsname()->nodename, tmp, len); - utsname()->nodename[len] = 0; + memset(utsname()->nodename + len, 0, + sizeof(utsname()->nodename) - len); errno = 0; } up_write(&uts_sem); @@ -1396,7 +1397,8 @@ asmlinkage long sys_setdomainname(char __user *name, int len) errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { memcpy(utsname()->domainname, tmp, len); - utsname()->domainname[len] = 0; + memset(utsname()->domainname + len, 0, + sizeof(utsname()->domainname) - len); errno = 0; } up_write(&uts_sem); -- cgit v1.1 From 9679e4dd628743b9ef4375d60ae69923c3766173 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 15 Oct 2008 22:01:51 -0700 Subject: kernel/sys.c: improve code generation utsname() is quite expensive to calculate. Cache it in a local. text data bss dec hex filename before: 11136 720 16 11872 2e60 kernel/sys.o after: 11096 720 16 11832 2e38 kernel/sys.o Acked-by: Vegard Nossum Cc: "Eric W. Biederman" Acked-by: "Serge E. Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 558b035..0bc8fa3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1349,9 +1349,10 @@ asmlinkage long sys_sethostname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->nodename, tmp, len); - memset(utsname()->nodename + len, 0, - sizeof(utsname()->nodename) - len); + struct new_utsname *u = utsname(); + + memcpy(u->nodename, tmp, len); + memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; } up_write(&uts_sem); @@ -1363,15 +1364,17 @@ asmlinkage long sys_sethostname(char __user *name, int len) asmlinkage long sys_gethostname(char __user *name, int len) { int i, errno; + struct new_utsname *u; if (len < 0) return -EINVAL; down_read(&uts_sem); - i = 1 + strlen(utsname()->nodename); + u = utsname(); + i = 1 + strlen(u->nodename); if (i > len) i = len; errno = 0; - if (copy_to_user(name, utsname()->nodename, i)) + if (copy_to_user(name, u->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1396,9 +1399,10 @@ asmlinkage long sys_setdomainname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->domainname, tmp, len); - memset(utsname()->domainname + len, 0, - sizeof(utsname()->domainname) - len); + struct new_utsname *u = utsname(); + + memcpy(u->domainname, tmp, len); + memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; } up_write(&uts_sem); -- cgit v1.1 From 7968b3d9a673e922ab980b2616945e63a52d88fe Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 15 Oct 2008 22:01:57 -0700 Subject: kernel/kallsyms.c: fix double return Commit 6dd06c9fbe025f542bce4cdb91790c0f91962722 ("module: make module_address_lookup safe") introduced double returns in the function kallsyms_lookup(), it's weird. The second one should be removed. Signed-off-by: WANG Cong Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 38fc10a..5072cf1 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -260,7 +260,6 @@ const char *kallsyms_lookup(unsigned long addr, /* see if it's in a module */ return module_address_lookup(addr, symbolsize, offset, modname, namebuf); - return NULL; } int lookup_symbol_name(unsigned long addr, char *symname) -- cgit v1.1 From e1f8e87449147ffe5ea3de64a46af7de450ce279 Mon Sep 17 00:00:00 2001 From: Francois Cami Date: Wed, 15 Oct 2008 22:01:59 -0700 Subject: Remove Andrew Morton's old email accounts People can use the real name an an index into MAINTAINERS to find the current email address. Signed-off-by: Francois Cami Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- kernel/workqueue.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index a430fd0..c3ab51d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -13,7 +13,7 @@ * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfred@colorfullife.com * Rewrote bits to get rid of console_lock - * 01Mar01 Andrew Morton + * 01Mar01 Andrew Morton */ #include diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4048e92..714afad 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -9,7 +9,7 @@ * Derived from the taskqueue/keventd code by: * * David Woodhouse - * Andrew Morton + * Andrew Morton * Kai Petzke * Theodore Ts'o * -- cgit v1.1 From 6d5cd6effed5f8c958a6c0df56da336f5a4fdb8a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Oct 2008 22:02:00 -0700 Subject: taint: fix kernel-doc Move print_tainted() kernel-doc to avoid the following error: Error(/var/linsrc/mmotm-2008-1002-1617//kernel/panic.c:155): cannot understand prototype: 'struct tnt ' Signed-off-by: Randy Dunlap Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 028013f..f290e8e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -143,21 +143,6 @@ NORET_TYPE void panic(const char * fmt, ...) EXPORT_SYMBOL(panic); -/** - * print_tainted - return a string to represent the kernel taint state. - * - * 'P' - Proprietary module has been loaded. - * 'F' - Module has been forcibly loaded. - * 'S' - SMP with CPUs not designed for SMP. - * 'R' - User forced a module unload. - * 'M' - System experienced a machine check exception. - * 'B' - System has hit bad_page. - * 'U' - Userspace-defined naughtiness. - * 'A' - ACPI table overridden. - * 'W' - Taint on warning. - * - * The string is overwritten by the next call to print_taint(). - */ struct tnt { u8 bit; @@ -178,6 +163,21 @@ static const struct tnt tnts[] = { { TAINT_WARN, 'W', ' ' }, }; +/** + * print_tainted - return a string to represent the kernel taint state. + * + * 'P' - Proprietary module has been loaded. + * 'F' - Module has been forcibly loaded. + * 'S' - SMP with CPUs not designed for SMP. + * 'R' - User forced a module unload. + * 'M' - System experienced a machine check exception. + * 'B' - System has hit bad_page. + * 'U' - Userspace-defined naughtiness. + * 'A' - ACPI table overridden. + * 'W' - Taint on warning. + * + * The string is overwritten by the next call to print_taint(). + */ const char *print_tainted(void) { static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; -- cgit v1.1 From 20036fdcaf05fac0a84ed81a56906493a7d822e2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 15 Oct 2008 22:02:02 -0700 Subject: Add kerneldoc documentation for new printk format extensions Add documentation in kerneldoc for new printk format extensions This patch documents the new %pS/%pF options in printk in kernel doc. Hope I didn't miss any other extension. Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index c3ab51d..fbd94bd 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -593,6 +593,8 @@ static int have_callable_console(void) * * See also: * printf(3) + * + * See the vsnprintf() documentation for format string extensions over C99. */ asmlinkage int printk(const char *fmt, ...) -- cgit v1.1 From b418da16dd44810e5d5a22bba377cca80512a524 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Oct 2008 22:02:06 -0700 Subject: compat: generic compat get/settimeofday Nothing arch specific in get/settimeofday. The details of the timeval conversion varied a little from arch to arch, but all with the same results. Also add an extern declaration for sys_tz to linux/time.h because externs in .c files are fowned upon. I'll kill the externs in various other files in a sparate patch. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Hellwig Acked-by: David S. Miller [ sparc bits ] Cc: "Luck, Tony" Cc: Ralf Baechle Acked-by: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 32c254a..143990e 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,6 +26,64 @@ #include +/* + * Note that the native side is already converted to a timespec, because + * that's what we want anyway. + */ +static int compat_get_timeval(struct timespec *o, + struct compat_timeval __user *i) +{ + long usec; + + if (get_user(o->tv_sec, &i->tv_sec) || + get_user(usec, &i->tv_usec)) + return -EFAULT; + o->tv_nsec = usec * 1000; + return 0; +} + +static int compat_put_timeval(struct compat_timeval __user *o, + struct timeval *i) +{ + return (put_user(i->tv_sec, &o->tv_sec) || + put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; +} + +asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + if (tv) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (compat_put_timeval(tv, &ktv)) + return -EFAULT; + } + if (tz) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + + return 0; +} + +asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + struct timespec kts; + struct timezone ktz; + + if (tv) { + if (compat_get_timeval(&kts, tv)) + return -EFAULT; + } + if (tz) { + if (copy_from_user(&ktz, tz, sizeof(ktz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); +} + int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || -- cgit v1.1 From f221e726bf4e082a05dcd573379ac859bfba7126 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Oct 2008 22:04:23 -0700 Subject: sysctl: simplify ->strategy name and nlen parameters passed to ->strategy hook are unused, remove them. In general ->strategy hook should know what it's doing, and don't do something tricky for which, say, pointer to original userspace array may be needed (name). Signed-off-by: Alexey Dobriyan Acked-by: David S. Miller [ networking bits ] Cc: Ralf Baechle Cc: David Howells Cc: Matt Mackall Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 29 +++++++++++++---------------- kernel/utsname_sysctl.c | 5 ++--- 2 files changed, 15 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ec88fcc..9792c31 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1500,7 +1500,6 @@ void register_sysctl_root(struct ctl_table_root *root) /* Perform the actual read/write of a sysctl table entry. */ static int do_sysctl_strategy(struct ctl_table_root *root, struct ctl_table *table, - int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1514,8 +1513,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root, return -EPERM; if (table->strategy) { - rc = table->strategy(table, name, nlen, oldval, oldlenp, - newval, newlen); + rc = table->strategy(table, oldval, oldlenp, newval, newlen); if (rc < 0) return rc; if (rc > 0) @@ -1525,8 +1523,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root, /* If there is no strategy routine, or if the strategy returns * zero, proceed with automatic r/w */ if (table->data && table->maxlen) { - rc = sysctl_data(table, name, nlen, oldval, oldlenp, - newval, newlen); + rc = sysctl_data(table, oldval, oldlenp, newval, newlen); if (rc < 0) return rc; } @@ -1558,7 +1555,7 @@ repeat: table = table->child; goto repeat; } - error = do_sysctl_strategy(root, table, name, nlen, + error = do_sysctl_strategy(root, table, oldval, oldlenp, newval, newlen); return error; @@ -2707,7 +2704,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, */ /* The generic sysctl data routine (used if no strategy routine supplied) */ -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, +int sysctl_data(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2741,7 +2738,7 @@ int sysctl_data(struct ctl_table *table, int __user *name, int nlen, } /* The generic string strategy routine: */ -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, +int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2787,7 +2784,7 @@ int sysctl_string(struct ctl_table *table, int __user *name, int nlen, * are between the minimum and maximum values given in the arrays * table->extra1 and table->extra2, respectively. */ -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, +int sysctl_intvec(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2823,7 +2820,7 @@ int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, } /* Strategy function to convert jiffies to seconds */ -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2857,7 +2854,7 @@ int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, } /* Strategy function to convert jiffies to seconds */ -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_ms_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2912,35 +2909,35 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) return error; } -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, +int sysctl_data(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, +int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, +int sysctl_intvec(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_ms_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4ab9659..3b34b35 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -60,7 +60,7 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, #ifdef CONFIG_SYSCTL_SYSCALL /* The generic string strategy routine: */ -static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, +static int sysctl_uts_string(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -69,8 +69,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, write = newval && newlen; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = sysctl_string(&uts_table, name, nlen, - oldval, oldlenp, newval, newlen); + r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen); put_uts(table, write, uts_table.data); return r; } -- cgit v1.1 From ebf3f09c634906d371f2bfd71b41c7e0c52efe7e Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 15 Oct 2008 22:05:12 -0700 Subject: Configure out AIO support This patchs adds the CONFIG_AIO option which allows to remove support for asynchronous I/O operations, that are not necessarly used by applications, particularly on embedded devices. As this is a size-reduction option, it depends on CONFIG_EMBEDDED. It allows to save ~7 kilobytes of kernel code/data: text data bss dec hex filename 1115067 119180 217088 1451335 162547 vmlinux 1108025 119048 217088 1444161 160941 vmlinux.new -7042 -132 0 -7174 -1C06 +/- This patch has been originally written by Matt Mackall , and is part of the Linux Tiny project. [randy.dunlap@oracle.com: build fix] Signed-off-by: Thomas Petazzoni Cc: Benjamin LaHaise Cc: Zach Brown Signed-off-by: Matt Mackall Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 5 +++++ kernel/sysctl.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 503d8d4..a77b27b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -126,6 +126,11 @@ cond_syscall(sys_vm86); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); cond_syscall(sys_flock); +cond_syscall(sys_io_setup); +cond_syscall(sys_io_destroy); +cond_syscall(sys_io_submit); +cond_syscall(sys_io_cancel); +cond_syscall(sys_io_getevents); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9792c31..617d41e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1281,6 +1281,7 @@ static struct ctl_table fs_table[] = { .extra2 = &two, }, #endif +#ifdef CONFIG_AIO { .procname = "aio-nr", .data = &aio_nr, @@ -1295,6 +1296,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, +#endif /* CONFIG_AIO */ #ifdef CONFIG_INOTIFY_USER { .ctl_name = FS_INOTIFY, -- cgit v1.1 From c26ec88ea86ad5122a6ea5ad635e6a1f6c395d74 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 15 Oct 2008 22:05:14 -0700 Subject: resources: tidy __request_region() No functional change. Just return NULL for kzalloc failure immediately, rather than wrapping the whole function body in the body of an "if". Signed-off-by: Bjorn Helgaas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 414d6fc..f193d6e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -630,33 +630,34 @@ struct resource * __request_region(struct resource *parent, { struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); - if (res) { - res->name = name; - res->start = start; - res->end = start + n - 1; - res->flags = IORESOURCE_BUSY; + if (!res) + return NULL; - write_lock(&resource_lock); + res->name = name; + res->start = start; + res->end = start + n - 1; + res->flags = IORESOURCE_BUSY; - for (;;) { - struct resource *conflict; + write_lock(&resource_lock); - conflict = __request_resource(parent, res); - if (!conflict) - break; - if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) - continue; - } + for (;;) { + struct resource *conflict; - /* Uhhuh, that didn't work out.. */ - kfree(res); - res = NULL; + conflict = __request_resource(parent, res); + if (!conflict) break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; } - write_unlock(&resource_lock); + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; } + write_unlock(&resource_lock); return res; } EXPORT_SYMBOL(__request_region); -- cgit v1.1 From 2b252c541158cfed9d7e5b019b894fe2174f5908 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 15 Oct 2008 22:05:20 -0700 Subject: make kprobes.c:kretprobe_table_lock() static Make the needlessly global kretprobe_table_lock() static. Signed-off-by: Adrian Bunk Acked-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 75bc2cd..8b57a25 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -404,7 +404,7 @@ void kretprobe_hash_lock(struct task_struct *tsk, spin_lock_irqsave(hlist_lock, *flags); } -void kretprobe_table_lock(unsigned long hash, unsigned long *flags) +static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); -- cgit v1.1 From 1c95e1b69073cff5ff179e592fa1a1e182c78a17 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Oct 2008 15:32:46 -0700 Subject: Fix kernel/softirq.c printk format warning properly This fixes the broken 77af7e3403e7314c47b0c07fbc5e4ef21d939532 ("softirq, warning fix: correct a format to avoid a warning") fix correctly. The type of a pointer subtraction is not "int", nor is it "long". It can be either (or something else). It's "ptrdiff_t", and the printk format for it is "%td". Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index be7a829..37d67aa 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -210,7 +210,7 @@ restart: h->action(h); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %d %p" + printk(KERN_ERR "huh, entered softirq %td %p" "with preempt_count %08x," " exited with %08x?\n", h - softirq_vec, h->action, prev_count, preempt_count()); -- cgit v1.1 From 54514a70adefe356afe854e2d3912d46668068e6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 23 Sep 2008 22:15:57 -0700 Subject: softirq: Add support for triggering softirq work on softirqs. This is basically a genericization of Jens Axboe's block layer remote softirq changes. Signed-off-by: David S. Miller Signed-off-by: Jens Axboe --- kernel/softirq.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 37d67aa..83ba21a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,6 +6,8 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Remote softirq infrastructure is by Jens Axboe. */ #include @@ -474,17 +476,144 @@ void tasklet_kill(struct tasklet_struct *t) EXPORT_SYMBOL(tasklet_kill); +DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); +EXPORT_PER_CPU_SYMBOL(softirq_work_list); + +static void __local_trigger(struct call_single_data *cp, int softirq) +{ + struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); + + list_add_tail(&cp->list, head); + + /* Trigger the softirq only if the list was previously empty. */ + if (head->next == &cp->list) + raise_softirq_irqoff(softirq); +} + +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS +static void remote_softirq_receive(void *data) +{ + struct call_single_data *cp = data; + unsigned long flags; + int softirq; + + softirq = cp->priv; + + local_irq_save(flags); + __local_trigger(cp, softirq); + local_irq_restore(flags); +} + +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + if (cpu_online(cpu)) { + cp->func = remote_softirq_receive; + cp->info = cp; + cp->flags = 0; + cp->priv = softirq; + + __smp_call_function_single(cpu, cp); + return 0; + } + return 1; +} +#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + return 1; +} +#endif + +/** + * __send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @this_cpu: the currently executing cpu + * @softirq: the softirq for the work + * + * Attempt to schedule softirq work on a remote cpu. If this cannot be + * done, the work is instead queued up on the local cpu. + * + * Interrupts must be disabled. + */ +void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) +{ + if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) + __local_trigger(cp, softirq); +} +EXPORT_SYMBOL(__send_remote_softirq); + +/** + * send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @softirq: the softirq for the work + * + * Like __send_remote_softirq except that disabling interrupts and + * computing the current cpu is done for the caller. + */ +void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + unsigned long flags; + int this_cpu; + + local_irq_save(flags); + this_cpu = smp_processor_id(); + __send_remote_softirq(cp, cpu, this_cpu, softirq); + local_irq_restore(flags); +} +EXPORT_SYMBOL(send_remote_softirq); + +static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + int i; + + local_irq_disable(); + for (i = 0; i < NR_SOFTIRQS; i++) { + struct list_head *head = &per_cpu(softirq_work_list[i], cpu); + struct list_head *local_head; + + if (list_empty(head)) + continue; + + local_head = &__get_cpu_var(softirq_work_list[i]); + list_splice_init(head, local_head); + raise_softirq_irqoff(i); + } + local_irq_enable(); + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { + .notifier_call = remote_softirq_cpu_notify, +}; + void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { + int i; + per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; + for (i = 0; i < NR_SOFTIRQS; i++) + INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } + register_hotcpu_notifier(&remote_softirq_cpu_notifier); + open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -- cgit v1.1 From 719254faa17ffedc87ba0fadb9b34e535c9758d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Oct 2008 09:59:47 +0200 Subject: NOHZ: unify the nohz function calls in irq_enter() We have two separate nohz function calls in irq_enter() for no good reason. Just call a single NOHZ function from irq_enter() and call the bits in the tick code. Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 10 +++------- kernel/time/tick-sched.c | 13 ++++++++++++- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 37d67aa..d410014 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -265,16 +265,12 @@ asmlinkage void do_softirq(void) */ void irq_enter(void) { -#ifdef CONFIG_NO_HZ int cpu = smp_processor_id(); + if (idle_cpu(cpu) && !in_interrupt()) - tick_nohz_stop_idle(cpu); -#endif + tick_check_idle(cpu); + __irq_enter(); -#ifdef CONFIG_NO_HZ - if (idle_cpu(cpu)) - tick_nohz_update_jiffies(); -#endif } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b711ffc..fdcf3f9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -155,7 +155,7 @@ void tick_nohz_update_jiffies(void) touch_softlockup_watchdog(); } -void tick_nohz_stop_idle(int cpu) +static void tick_nohz_stop_idle(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -559,6 +559,17 @@ static inline void tick_nohz_switch_to_nohz(void) { } #endif /* NO_HZ */ /* + * Called from irq_enter to notify about the possible interruption of idle() + */ +void tick_check_idle(int cpu) +{ +#ifdef CONFIG_NO_HZ + tick_nohz_stop_idle(cpu); + tick_nohz_update_jiffies(); +#endif +} + +/* * High resolution timer specific code */ #ifdef CONFIG_HIGH_RES_TIMERS -- cgit v1.1 From c34bec5a44e9486597d78e7a686b2f9088a0564c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Oct 2008 10:04:34 +0200 Subject: NOHZ: split tick_nohz_restart_sched_tick() Split out the clock event device reprogramming. Preparatory patch. Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 49 ++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fdcf3f9..7aedf43 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -377,6 +377,32 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + ts->sched_timer.expires = ts->idle_tick; + + while (1) { + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, + ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + } else { + if (!tick_program_event(ts->sched_timer.expires, 0)) + break; + } + /* Update jiffies and reread time */ + tick_do_update_jiffies64(now); + now = ktime_get(); + } +} + /** * tick_nohz_restart_sched_tick - restart the idle tick from the idle task * @@ -430,28 +456,7 @@ void tick_nohz_restart_sched_tick(void) */ ts->tick_stopped = 0; ts->idle_exittime = now; - hrtimer_cancel(&ts->sched_timer); - ts->sched_timer.expires = ts->idle_tick; - - while (1) { - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, - ts->sched_timer.expires, - HRTIMER_MODE_ABS); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - } else { - if (!tick_program_event(ts->sched_timer.expires, 0)) - break; - } - /* Update jiffies and reread time */ - tick_do_update_jiffies64(now); - now = ktime_get(); - } + tick_nohz_restart(ts, now); local_irq_enable(); } -- cgit v1.1 From fb02fbc14d17837b4b7b02dbb36142c16a7bf208 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Oct 2008 10:01:23 +0200 Subject: NOHZ: restart tick device from irq_enter() We did not restart the tick device from irq_enter() to avoid double reprogramming and extra events in the return immediate to idle case. But long lasting softirqs can lead to a situation where jiffies become stale: idle() tick stopped (reprogrammed to next pending timer) halt() interrupt jiffies updated from irq_enter() interrupt handler softirq function 1 runs 20ms softirq function 2 arms a 10ms timer with a stale jiffies value jiffies updated from irq_exit() timer wheel has now an already expired timer (the one added in function 2) timer fires and timer softirq runs This was discovered when debugging a timer problem which happend only when the ath5k driver is active. The debugging proved that there is a softirq function running for more than 20ms, which is a bug by itself. To solve this we restart the tick timer right from irq_enter(), but do not go through the other functions which are necessary to return from idle when need_resched() is set. Reported-by: Elias Oltmanns Signed-off-by: Thomas Gleixner Tested-by: Elias Oltmanns --- kernel/time/tick-broadcast.c | 13 +++++++++++++ kernel/time/tick-internal.h | 2 ++ kernel/time/tick-sched.c | 31 +++++++++++++++++++++++-------- 3 files changed, 38 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index cb01cd8..f98a1b7 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -384,6 +384,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) } /* + * Called from irq_enter() when idle was interrupted to reenable the + * per cpu device. + */ +void tick_check_oneshot_broadcast(int cpu) +{ + if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); + } +} + +/* * Handle oneshot mode broadcasting */ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 4692487..b1c05bf 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -36,6 +36,7 @@ extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); +extern void tick_check_oneshot_broadcast(int cpu); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -45,6 +46,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline void tick_check_oneshot_broadcast(int cpu) { } # endif /* !BROADCAST */ #else /* !ONESHOT */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7aedf43..0581c11 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -508,10 +508,6 @@ static void tick_nohz_handler(struct clock_event_device *dev) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return; - while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); tick_do_update_jiffies64(now); @@ -557,6 +553,27 @@ static void tick_nohz_switch_to_nohz(void) smp_processor_id()); } +/* + * When NOHZ is enabled and the tick is stopped, we need to kick the + * tick timer from irq_enter() so that the jiffies update is kept + * alive during long running softirqs. That's ugly as hell, but + * correctness is key even if we need to fix the offending softirq in + * the first place. + * + * Note, this is different to tick_nohz_restart. We just kick the + * timer and do not touch the other magic bits which need to be done + * when idle is left. + */ +static void tick_nohz_kick_tick(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!ts->tick_stopped) + return; + + tick_nohz_restart(ts, ktime_get()); +} + #else static inline void tick_nohz_switch_to_nohz(void) { } @@ -568,9 +585,11 @@ static inline void tick_nohz_switch_to_nohz(void) { } */ void tick_check_idle(int cpu) { + tick_check_oneshot_broadcast(cpu); #ifdef CONFIG_NO_HZ tick_nohz_stop_idle(cpu); tick_nohz_update_jiffies(); + tick_nohz_kick_tick(cpu); #endif } @@ -627,10 +646,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) profile_tick(CPU_PROFILING); } - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return HRTIMER_NORESTART; - hrtimer_forward(timer, now, tick_period); return HRTIMER_RESTART; -- cgit v1.1 From e67ef25a35b949561a9bd77693523ec94ab4a278 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Sep 2008 23:50:23 +0200 Subject: timer_list: print real timer address The current timer_list output prints the address of the on stack copy of the active hrtimer instead of the hrtimer itself. Print the address of the real timer instead. Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a40e20f..ec9ea6c 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -47,13 +47,14 @@ static void print_name_offset(struct seq_file *m, void *sym) } static void -print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, + int idx, u64 now) { #ifdef CONFIG_TIMER_STATS char tmp[TASK_COMM_LEN + 1]; #endif SEQ_printf(m, " #%d: ", idx); - print_name_offset(m, timer); + print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); SEQ_printf(m, ", S:%02lx", timer->state); @@ -99,7 +100,7 @@ next_one: tmp = *timer; spin_unlock_irqrestore(&base->cpu_base->lock, flags); - print_timer(m, &tmp, i, now); + print_timer(m, timer, &tmp, i, now); next++; goto next_one; } -- cgit v1.1 From c5b77a3d3a716a5c61a1999d7f2a78e9c39fd1b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 17:31:41 +0200 Subject: timer_list: print cpu number of clockevents device The per cpu clock events device output of timer_list lacks an association of the device to the cpu which is annoying when looking at the output of /proc/timer_list from a 128 way system. Add the CPU number info and mark the broadcast device in the device list printout. Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ec9ea6c..5479c6e 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -184,12 +184,16 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #ifdef CONFIG_GENERIC_CLOCKEVENTS static void -print_tickdevice(struct seq_file *m, struct tick_device *td) +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); + if (cpu < 0) + SEQ_printf(m, "Broadcast device\n"); + else + SEQ_printf(m, "Per CPU device: %d\n", cpu); SEQ_printf(m, "Clock Event Device: "); if (!dev) { @@ -223,7 +227,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) int cpu; #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST - print_tickdevice(m, tick_get_broadcast_device()); + print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", tick_get_broadcast_mask()->bits[0]); #ifdef CONFIG_TICK_ONESHOT @@ -233,7 +237,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) SEQ_printf(m, "\n"); #endif for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu)); + print_tickdevice(m, tick_get_device(cpu), cpu); SEQ_printf(m, "\n"); } #else -- cgit v1.1 From 870e2a284567714335d125c390366dce882d726f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 17:41:55 +0200 Subject: timer_list: add base address to clock base The base address of a (per cpu) clock base is a useful debug info. Add it and bump the version number of timer_lists. Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 5479c6e..f642691 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -110,6 +110,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %Lu nsecs\n", @@ -249,7 +250,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.3\n"); + SEQ_printf(m, "Timer List Version: v0.4\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); -- cgit v1.1