From b352bc1cbc29134a356b5c16ee2281807a7b984e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Mar 2013 14:25:32 +0100 Subject: tick: Convert broadcast cpu bitmaps to cpumask_var_t Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20130306111537.366394000@linutronix.de Cc: Rusty Russell --- kernel/time/tick-broadcast.c | 86 ++++++++++++++++++++++---------------------- kernel/time/tick-common.c | 1 + kernel/time/tick-internal.h | 3 +- 3 files changed, 46 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2fb8cb8..35b8875 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -28,9 +28,8 @@ */ static struct tick_device tick_broadcast_device; -/* FIXME: Use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); -static DECLARE_BITMAP(tmpmask, NR_CPUS); +static cpumask_var_t tick_broadcast_mask; +static cpumask_var_t tmpmask; static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; @@ -50,7 +49,7 @@ struct tick_device *tick_get_broadcast_device(void) struct cpumask *tick_get_broadcast_mask(void) { - return to_cpumask(tick_broadcast_mask); + return tick_broadcast_mask; } /* @@ -74,7 +73,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) clockevents_exchange_device(tick_broadcast_device.evtdev, dev); tick_broadcast_device.evtdev = dev; - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(dev); return 1; } @@ -123,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; tick_device_setup_broadcast_func(dev); - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); + cpumask_set_cpu(cpu, tick_broadcast_mask); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; } else { @@ -134,7 +133,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { int cpu = smp_processor_id(); - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_mask); tick_broadcast_clear_oneshot(cpu); } else { tick_device_setup_broadcast_func(dev); @@ -198,9 +197,8 @@ static void tick_do_periodic_broadcast(void) { raw_spin_lock(&tick_broadcast_lock); - cpumask_and(to_cpumask(tmpmask), - cpu_online_mask, tick_get_broadcast_mask()); - tick_do_broadcast(to_cpumask(tmpmask)); + cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); + tick_do_broadcast(tmpmask); raw_spin_unlock(&tick_broadcast_lock); } @@ -263,13 +261,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason) if (!tick_device_is_functional(dev)) goto out; - bc_stopped = cpumask_empty(tick_get_broadcast_mask()); + bc_stopped = cpumask_empty(tick_broadcast_mask); switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); @@ -279,8 +276,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: if (!tick_broadcast_force && - cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); @@ -288,7 +284,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) break; } - if (cpumask_empty(tick_get_broadcast_mask())) { + if (cpumask_empty(tick_broadcast_mask)) { if (!bc_stopped) clockevents_shutdown(bc); } else if (bc_stopped) { @@ -337,10 +333,10 @@ void tick_shutdown_broadcast(unsigned int *cpup) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_mask); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { - if (bc && cpumask_empty(tick_get_broadcast_mask())) + if (bc && cpumask_empty(tick_broadcast_mask)) clockevents_shutdown(bc); } @@ -376,13 +372,13 @@ int tick_resume_broadcast(void) switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(bc); broadcast = cpumask_test_cpu(smp_processor_id(), - tick_get_broadcast_mask()); + tick_broadcast_mask); break; case TICKDEV_MODE_ONESHOT: - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) broadcast = tick_resume_broadcast_oneshot(bc); break; } @@ -395,15 +391,14 @@ int tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT -/* FIXME: use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); +static cpumask_var_t tick_broadcast_oneshot_mask; /* * Exposed for debugging: see timer_list.c */ struct cpumask *tick_get_broadcast_oneshot_mask(void) { - return to_cpumask(tick_broadcast_oneshot_mask); + return tick_broadcast_oneshot_mask; } static int tick_broadcast_set_event(ktime_t expires, int force) @@ -428,7 +423,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) */ void tick_check_oneshot_broadcast(int cpu) { - if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { + if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); @@ -448,13 +443,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; - cpumask_clear(to_cpumask(tmpmask)); + cpumask_clear(tmpmask); now = ktime_get(); /* Find all expired events */ - for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { + for_each_cpu(cpu, tick_broadcast_oneshot_mask) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event.tv64 <= now.tv64) - cpumask_set_cpu(cpu, to_cpumask(tmpmask)); + cpumask_set_cpu(cpu, tmpmask); else if (td->evtdev->next_event.tv64 < next_event.tv64) next_event.tv64 = td->evtdev->next_event.tv64; } @@ -462,7 +457,7 @@ again: /* * Wakeup the cpus which have an expired event. */ - tick_do_broadcast(to_cpumask(tmpmask)); + tick_do_broadcast(tmpmask); /* * Two reasons for reprogram: @@ -518,16 +513,13 @@ void tick_broadcast_oneshot_control(unsigned long reason) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); if (dev->next_event.tv64 < bc->next_event.tv64) tick_broadcast_set_event(dev->next_event, 1); } } else { - if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_clear_cpu(cpu, - tick_get_broadcast_oneshot_mask()); + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); if (dev->next_event.tv64 != KTIME_MAX) tick_program_event(dev->next_event, 1); @@ -543,7 +535,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) */ static void tick_broadcast_clear_oneshot(int cpu) { - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } static void tick_broadcast_init_next_event(struct cpumask *mask, @@ -581,15 +573,14 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) * oneshot_mask bits for those and program the * broadcast device to fire. */ - cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); - cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); - cpumask_or(tick_get_broadcast_oneshot_mask(), - tick_get_broadcast_oneshot_mask(), - to_cpumask(tmpmask)); + cpumask_copy(tmpmask, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tmpmask); + cpumask_or(tick_broadcast_oneshot_mask, + tick_broadcast_oneshot_mask, tmpmask); - if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { + if (was_periodic && !cpumask_empty(tmpmask)) { clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - tick_broadcast_init_next_event(to_cpumask(tmpmask), + tick_broadcast_init_next_event(tmpmask, tick_next_period); tick_broadcast_set_event(tick_next_period, 1); } else @@ -639,7 +630,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) * Clear the broadcast mask flag for the dead cpu, but do not * stop the broadcast device! */ - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -663,3 +654,12 @@ bool tick_broadcast_oneshot_available(void) } #endif + +void __init tick_broadcast_init(void) +{ + alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + alloc_cpumask_var(&tmpmask, GFP_NOWAIT); +#ifdef CONFIG_TICK_ONESHOT + alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); +#endif +} diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b1600a6..74413e3 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -416,4 +416,5 @@ static struct notifier_block tick_notifier = { void __init tick_init(void) { clockevents_register_notifier(&tick_notifier); + tick_broadcast_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cf3e59e..46d9bd0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -94,7 +94,7 @@ extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); extern void tick_shutdown_broadcast(unsigned int *cpup); extern void tick_suspend_broadcast(void); extern int tick_resume_broadcast(void); - +extern void tick_broadcast_init(void); extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); @@ -119,6 +119,7 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } static inline void tick_shutdown_broadcast(unsigned int *cpup) { } static inline void tick_suspend_broadcast(void) { } static inline int tick_resume_broadcast(void) { return 0; } +static inline void tick_broadcast_init(void) { } /* * Set the periodic handler in non broadcast mode -- cgit v1.1 From f9ae39d04ccdec8d8ecf532191b7056c279a22c0 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 2 Mar 2013 11:10:10 +0100 Subject: tick: Pass broadcast device to tick_broadcast_set_event() Pass the broadcast timer to tick_broadcast_set_event() instead of reevaluating tick_broadcast_device.evtdev. [ tglx: Massaged changelog ] Signed-off-by: Daniel Lezcano Cc: viresh.kumar@linaro.org Cc: jacob.jun.pan@linux.intel.com Cc: linux-arm-kernel@lists.infradead.org Cc: santosh.shilimkar@ti.com Cc: linaro-kernel@lists.linaro.org Cc: patches@linaro.org Cc: rickard.andersson@stericsson.com Cc: vincent.guittot@linaro.org Cc: linus.walleij@stericsson.com Cc: john.stultz@linaro.org Link: http://lkml.kernel.org/r/1362219013-18173-2-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 35b8875..70dd98c 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -401,10 +401,9 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void) return tick_broadcast_oneshot_mask; } -static int tick_broadcast_set_event(ktime_t expires, int force) +static int tick_broadcast_set_event(struct clock_event_device *bc, + ktime_t expires, int force) { - struct clock_event_device *bc = tick_broadcast_device.evtdev; - if (bc->mode != CLOCK_EVT_MODE_ONESHOT) clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); @@ -474,7 +473,7 @@ again: * Rearm the broadcast device. If event expired, * repeat the above */ - if (tick_broadcast_set_event(next_event, 0)) + if (tick_broadcast_set_event(dev, next_event, 0)) goto again; } raw_spin_unlock(&tick_broadcast_lock); @@ -516,7 +515,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); if (dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(dev->next_event, 1); + tick_broadcast_set_event(bc, dev->next_event, 1); } } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { @@ -582,7 +581,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); tick_broadcast_init_next_event(tmpmask, tick_next_period); - tick_broadcast_set_event(tick_next_period, 1); + tick_broadcast_set_event(bc, tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; } else { -- cgit v1.1 From d2348fb6fdc6d671ad45b62db237f76c8c115603 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 2 Mar 2013 11:10:11 +0100 Subject: tick: Dynamically set broadcast irq affinity When a cpu goes to a deep idle state where its local timer is shutdown, it notifies the time frame work to use the broadcast timer instead. Unfortunately, the broadcast device could wake up any CPU, including an idle one which is not concerned by the wake up at all. So in the worst case an idle CPU will wake up to send an IPI to the CPU whose timer expired. Provide an opt-in feature CLOCK_EVT_FEAT_DYNIRQ which tells the core that is should set the interrupt affinity of the broadcast interrupt to the cpu which has the earliest expiry time. This avoids unnecessary spurious wakeups and IPIs. [ tglx: Adopted to cpumask rework, silenced an uninitialized warning, massaged changelog ] Signed-off-by: Daniel Lezcano Cc: viresh.kumar@linaro.org Cc: jacob.jun.pan@linux.intel.com Cc: linux-arm-kernel@lists.infradead.org Cc: santosh.shilimkar@ti.com Cc: linaro-kernel@lists.linaro.org Cc: patches@linaro.org Cc: rickard.andersson@stericsson.com Cc: vincent.guittot@linaro.org Cc: linus.walleij@stericsson.com Cc: john.stultz@linaro.org Link: http://lkml.kernel.org/r/1362219013-18173-3-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 70dd98c..380910d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -401,13 +401,34 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void) return tick_broadcast_oneshot_mask; } -static int tick_broadcast_set_event(struct clock_event_device *bc, +/* + * Set broadcast interrupt affinity + */ +static void tick_broadcast_set_affinity(struct clock_event_device *bc, + const struct cpumask *cpumask) +{ + if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) + return; + + if (cpumask_equal(bc->cpumask, cpumask)) + return; + + bc->cpumask = cpumask; + irq_set_affinity(bc->irq, bc->cpumask); +} + +static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, ktime_t expires, int force) { + int ret; + if (bc->mode != CLOCK_EVT_MODE_ONESHOT) clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - return clockevents_program_event(bc, expires, force); + ret = clockevents_program_event(bc, expires, force); + if (!ret) + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); + return ret; } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) @@ -436,7 +457,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; ktime_t now, next_event; - int cpu; + int cpu, next_cpu = 0; raw_spin_lock(&tick_broadcast_lock); again: @@ -447,10 +468,12 @@ again: /* Find all expired events */ for_each_cpu(cpu, tick_broadcast_oneshot_mask) { td = &per_cpu(tick_cpu_device, cpu); - if (td->evtdev->next_event.tv64 <= now.tv64) + if (td->evtdev->next_event.tv64 <= now.tv64) { cpumask_set_cpu(cpu, tmpmask); - else if (td->evtdev->next_event.tv64 < next_event.tv64) + } else if (td->evtdev->next_event.tv64 < next_event.tv64) { next_event.tv64 = td->evtdev->next_event.tv64; + next_cpu = cpu; + } } /* @@ -473,7 +496,7 @@ again: * Rearm the broadcast device. If event expired, * repeat the above */ - if (tick_broadcast_set_event(dev, next_event, 0)) + if (tick_broadcast_set_event(dev, next_cpu, next_event, 0)) goto again; } raw_spin_unlock(&tick_broadcast_lock); @@ -515,7 +538,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); if (dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(bc, dev->next_event, 1); + tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { @@ -581,7 +604,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); tick_broadcast_init_next_event(tmpmask, tick_next_period); - tick_broadcast_set_event(bc, tick_next_period, 1); + tick_broadcast_set_event(bc, cpu, tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; } else { -- cgit v1.1 From 26517f3e99248668315aee9460dcea21628cdd7f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Mar 2013 11:18:35 +0000 Subject: tick: Avoid programming the local cpu timer if broadcast pending If the local cpu timer stops in deep idle, we arm the broadcast device and get woken by an IPI. Now when we return from deep idle we reenable the local cpu timer unconditionally before handling the IPI. But that's a pointless exercise: the timer is already expired and the IPI is on the way. And it's an expensive exercise as we use the forced reprogramming mode so that we do not lose a timer event. This forced reprogramming will loop at least once in the retry. To avoid this reprogramming, we mark the cpu in a pending bit mask before we send the IPI. Now when the IPI target cpu wakes up, it will see the pending bit set and skip the reprogramming. The reprogramming of the cpu local timer will happen in the IPI handler which runs the cpu local timer interrupt function. Reported-by: Jason Liu Signed-off-by: Thomas Gleixner Cc: LAK Cc: John Stultz Cc: Arjan van de Veen Cc: Lorenzo Pieralisi Tested-by: Santosh Shilimkar Link: http://lkml.kernel.org/r/20130306111537.431082074@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 380910d..005c0ca 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -392,6 +392,7 @@ int tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT static cpumask_var_t tick_broadcast_oneshot_mask; +static cpumask_var_t tick_broadcast_pending_mask; /* * Exposed for debugging: see timer_list.c @@ -470,6 +471,12 @@ again: td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event.tv64 <= now.tv64) { cpumask_set_cpu(cpu, tmpmask); + /* + * Mark the remote cpu in the pending mask, so + * it can avoid reprogramming the cpu local + * timer in tick_broadcast_oneshot_control(). + */ + cpumask_set_cpu(cpu, tick_broadcast_pending_mask); } else if (td->evtdev->next_event.tv64 < next_event.tv64) { next_event.tv64 = td->evtdev->next_event.tv64; next_cpu = cpu; @@ -535,6 +542,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { + WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); if (dev->next_event.tv64 < bc->next_event.tv64) @@ -543,10 +551,25 @@ void tick_broadcast_oneshot_control(unsigned long reason) } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - if (dev->next_event.tv64 != KTIME_MAX) - tick_program_event(dev->next_event, 1); + if (dev->next_event.tv64 == KTIME_MAX) + goto out; + /* + * The cpu which was handling the broadcast + * timer marked this cpu in the broadcast + * pending mask and fired the broadcast + * IPI. So we are going to handle the expired + * event anyway via the broadcast IPI + * handler. No need to reprogram the timer + * with an already expired event. + */ + if (cpumask_test_and_clear_cpu(cpu, + tick_broadcast_pending_mask)) + goto out; + + tick_program_event(dev->next_event, 1); } } +out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -683,5 +706,6 @@ void __init tick_broadcast_init(void) alloc_cpumask_var(&tmpmask, GFP_NOWAIT); #ifdef CONFIG_TICK_ONESHOT alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); + alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); #endif } -- cgit v1.1 From 989dcb645ca715129c5a2b39102c8334a20d9615 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Mar 2013 11:18:35 +0000 Subject: tick: Handle broadcast wakeup of multiple cpus Some brilliant hardware implementations wake multiple cores when the broadcast timer fires. This leads to the following interesting problem: CPU0 CPU1 wakeup from idle wakeup from idle leave broadcast mode leave broadcast mode restart per cpu timer restart per cpu timer go back to idle handle broadcast (empty mask) enter broadcast mode programm broadcast device enter broadcast mode programm broadcast device So what happens is that due to the forced reprogramming of the cpu local timer, we need to set a event in the future. Now if we manage to go back to idle before the timer fires, we switch off the timer and arm the broadcast device with an already expired time (covered by forced mode). So in the worst case we repeat the above ping pong forever. Unfortunately we have no information about what caused the wakeup, but we can check current time against the expiry time of the local cpu. If the local event is already in the past, we know that the broadcast timer is about to fire and send an IPI. So we mark ourself as an IPI target even if we left broadcast mode and avoid the reprogramming of the local cpu timer. This still leaves the possibility that a CPU which is not handling the broadcast interrupt is going to reach idle again before the IPI arrives. This can't be solved in the core code and will be handled in follow up patches. Reported-by: Jason Liu Signed-off-by: Thomas Gleixner Cc: LAK Cc: John Stultz Cc: Arjan van de Veen Cc: Lorenzo Pieralisi Tested-by: Santosh Shilimkar Link: http://lkml.kernel.org/r/20130306111537.492045206@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 59 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 005c0ca..2100aad 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -393,6 +393,7 @@ int tick_resume_broadcast(void) static cpumask_var_t tick_broadcast_oneshot_mask; static cpumask_var_t tick_broadcast_pending_mask; +static cpumask_var_t tick_broadcast_force_mask; /* * Exposed for debugging: see timer_list.c @@ -483,6 +484,10 @@ again: } } + /* Take care of enforced broadcast requests */ + cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); + cpumask_clear(tick_broadcast_force_mask); + /* * Wakeup the cpus which have an expired event. */ @@ -518,6 +523,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags; + ktime_t now; int cpu; /* @@ -545,7 +551,16 @@ void tick_broadcast_oneshot_control(unsigned long reason) WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); - if (dev->next_event.tv64 < bc->next_event.tv64) + /* + * We only reprogram the broadcast timer if we + * did not mark ourself in the force mask and + * if the cpu local event is earlier than the + * broadcast event. If the current CPU is in + * the force mask, then we are going to be + * woken by the IPI right away. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && + dev->next_event.tv64 < bc->next_event.tv64) tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } } else { @@ -566,6 +581,47 @@ void tick_broadcast_oneshot_control(unsigned long reason) tick_broadcast_pending_mask)) goto out; + /* + * If the pending bit is not set, then we are + * either the CPU handling the broadcast + * interrupt or we got woken by something else. + * + * We are not longer in the broadcast mask, so + * if the cpu local expiry time is already + * reached, we would reprogram the cpu local + * timer with an already expired event. + * + * This can lead to a ping-pong when we return + * to idle and therefor rearm the broadcast + * timer before the cpu local timer was able + * to fire. This happens because the forced + * reprogramming makes sure that the event + * will happen in the future and depending on + * the min_delta setting this might be far + * enough out that the ping-pong starts. + * + * If the cpu local next_event has expired + * then we know that the broadcast timer + * next_event has expired as well and + * broadcast is about to be handled. So we + * avoid reprogramming and enforce that the + * broadcast handler, which did not run yet, + * will invoke the cpu local handler. + * + * We cannot call the handler directly from + * here, because we might be in a NOHZ phase + * and we did not go through the irq_enter() + * nohz fixups. + */ + now = ktime_get(); + if (dev->next_event.tv64 <= now.tv64) { + cpumask_set_cpu(cpu, tick_broadcast_force_mask); + goto out; + } + /* + * We got woken by something else. Reprogram + * the cpu local timer device. + */ tick_program_event(dev->next_event, 1); } } @@ -707,5 +763,6 @@ void __init tick_broadcast_init(void) #ifdef CONFIG_TICK_ONESHOT alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); + alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); #endif } -- cgit v1.1 From eaa907c546f76222227dfc41784b22588af1e3d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Mar 2013 11:18:36 +0000 Subject: tick: Provide a check for a forced broadcast pending On the CPU which gets woken along with the target CPU of the broadcast the following happens: deep_idle() <-- spurious wakeup broadcast_exit() set forced bit enable interrupts <-- Nothing happens disable interrupts broadcast_enter() <-- Here we observe the forced bit is set deep_idle() Now after that the target CPU of the broadcast runs the broadcast handler and finds the other CPU in both the broadcast and the forced mask, sends the IPI and stuff gets back to normal. So it's not actually harmful, just more evidence for the theory, that hardware designers have access to very special drug supplies. Now there is no point in going back to deep idle just to wake up again right away via an IPI. Provide a check which allows the idle code to avoid the deep idle transition. Signed-off-by: Thomas Gleixner Cc: LAK Cc: John Stultz Cc: Arjan van de Veen Cc: Lorenzo Pieralisi Tested-by: Santosh Shilimkar Cc: Jason Liu Link: http://lkml.kernel.org/r/20130306111537.565418308@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2100aad..d76d816 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -404,6 +404,18 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void) } /* + * Called before going idle with interrupts disabled. Checks whether a + * broadcast event from the other core is about to happen. We detected + * that in tick_broadcast_oneshot_control(). The callsite can use this + * to avoid a deep idle transition as we are about to get the + * broadcast IPI right away. + */ +int tick_check_broadcast_expired(void) +{ + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +} + +/* * Set broadcast interrupt affinity */ static void tick_broadcast_set_affinity(struct clock_event_device *bc, -- cgit v1.1 From c30bd09915ea243603d7803d53de890c4a6f1474 Mon Sep 17 00:00:00 2001 From: Dong Zhu Date: Thu, 6 Dec 2012 22:03:34 +0800 Subject: timekeeping: Avoid adjust kernel time once hwclock kept in UTC time If the Hardware Clock kept in local time,kernel will adjust the time to be UTC time.But if Hardware Clock kept in UTC time,system will make a dummy settimeofday call first (sys_tz.tz_minuteswest = 0) to make sure the time is not shifted,so at this point I think maybe it is not necessary to set the kernel time once the sys_tz.tz_minuteswest is zero. Signed-off-by: Dong Zhu [jstultz: Updated to merge with conflicting changes ] Signed-off-by: John Stultz --- kernel/time.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index f8342a4..effac57 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -138,13 +138,14 @@ int persistent_clock_is_local; */ static inline void warp_clock(void) { - struct timespec adjust; + if (sys_tz.tz_minuteswest != 0) { + struct timespec adjust; - adjust = current_kernel_time(); - if (sys_tz.tz_minuteswest != 0) persistent_clock_is_local = 1; - adjust.tv_sec += sys_tz.tz_minuteswest * 60; - do_settimeofday(&adjust); + adjust = current_kernel_time(); + adjust.tv_sec += sys_tz.tz_minuteswest * 60; + do_settimeofday(&adjust); + } } /* -- cgit v1.1 From 7859e404ae73fe4f38b8cfc1af19ea82f153084e Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Feb 2013 12:33:29 -0800 Subject: timekeeping: Use inject_offset in warp_clock When warping the clock (from a local time RTC), use timekeeping_inject_offset() to atomically add the offset. This avoids any minor time error caused by the delay between reading the time, and then setting the adjusted time. Signed-off-by: John Stultz --- kernel/time.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index effac57..d3617db 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -142,9 +142,9 @@ static inline void warp_clock(void) struct timespec adjust; persistent_clock_is_local = 1; - adjust = current_kernel_time(); - adjust.tv_sec += sys_tz.tz_minuteswest * 60; - do_settimeofday(&adjust); + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); } } -- cgit v1.1 From e445cf1c4257cc0238d72e4129eb4739f46fd3de Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 12 Mar 2013 11:56:48 +0800 Subject: timekeeping: utilize the suspend-nonstop clocksource to count suspended time There are some new processors whose TSC clocksource won't stop during suspend. Currently, after system resumes, kernel will use persistent clock or RTC to compensate the sleep time, but with these nonstop clocksources, we could skip the special compensation from external sources, and just use current clocksource for time recounting. This can solve some time drift bugs caused by some not-so-accurate or error-prone RTC devices. The current way to count suspended time is first try to use the persistent clock, and then try the RTC if persistent clock can't be used. This patch will change the trying order to: suspend-nonstop clocksource -> persistent clock -> RTC When counting the sleep time with nonstop clocksource, use an accurate way suggested by Jason Gunthorpe to cover very large delta cycles. Signed-off-by: Feng Tang [jstultz: Small optimization, avoiding re-reading the clocksource] Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 58 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9a0bc98..0355f12 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -788,22 +788,66 @@ void timekeeping_inject_sleeptime(struct timespec *delta) static void timekeeping_resume(void) { struct timekeeper *tk = &timekeeper; + struct clocksource *clock = tk->clock; unsigned long flags; - struct timespec ts; + struct timespec ts_new, ts_delta; + cycle_t cycle_now, cycle_delta; + bool suspendtime_found = false; - read_persistent_clock(&ts); + read_persistent_clock(&ts_new); clockevents_resume(); clocksource_resume(); write_seqlock_irqsave(&tk->lock, flags); - if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { - ts = timespec_sub(ts, timekeeping_suspend_time); - __timekeeping_inject_sleeptime(tk, &ts); + /* + * After system resumes, we need to calculate the suspended time and + * compensate it for the OS time. There are 3 sources that could be + * used: Nonstop clocksource during suspend, persistent clock and rtc + * device. + * + * One specific platform may have 1 or 2 or all of them, and the + * preference will be: + * suspend-nonstop clocksource -> persistent clock -> rtc + * The less preferred source will only be tried if there is no better + * usable source. The rtc part is handled separately in rtc core code. + */ + cycle_now = clock->read(clock); + if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && + cycle_now > clock->cycle_last) { + u64 num, max = ULLONG_MAX; + u32 mult = clock->mult; + u32 shift = clock->shift; + s64 nsec = 0; + + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* + * "cycle_delta * mutl" may cause 64 bits overflow, if the + * suspended time is too long. In that case we need do the + * 64 bits math carefully + */ + do_div(max, mult); + if (cycle_delta > max) { + num = div64_u64(cycle_delta, max); + nsec = (((u64) max * mult) >> shift) * num; + cycle_delta -= num * max; + } + nsec += ((u64) cycle_delta * mult) >> shift; + + ts_delta = ns_to_timespec(nsec); + suspendtime_found = true; + } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { + ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); + suspendtime_found = true; } - /* re-base the last cycle value */ - tk->clock->cycle_last = tk->clock->read(tk->clock); + + if (suspendtime_found) + __timekeeping_inject_sleeptime(tk, &ts_delta); + + /* Re-base the last cycle value */ + clock->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, false); -- cgit v1.1 From cc244ddae6d4c6902ac9d7d64023534f8c44a7eb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 3 May 2012 12:30:07 -0700 Subject: timekeeping: Move TAI managment into timekeeping core from ntp Currently NTP manages the TAI offset. Since there's plans for a CLOCK_TAI clockid, push the TAI management into the timekeeping core. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- kernel/time/ntp.c | 18 ++++++++++-------- kernel/time/timekeeping.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 072bb06..59e2749 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -53,9 +53,6 @@ static int time_state = TIME_OK; /* clock status bits: */ static int time_status = STA_UNSYNC; -/* TAI offset (secs): */ -static long time_tai; - /* time adjustment (nsecs): */ static s64 time_offset; @@ -415,7 +412,6 @@ int second_overflow(unsigned long secs) else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; - time_tai++; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); } @@ -425,7 +421,6 @@ int second_overflow(unsigned long secs) time_state = TIME_OK; else if ((secs + 1) % 86400 == 0) { leap = 1; - time_tai--; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); @@ -579,7 +574,9 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) * Called with ntp_lock held, so we can access and modify * all the global NTP state: */ -static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) +static inline void process_adjtimex_modes(struct timex *txc, + struct timespec *ts, + s32 *time_tai) { if (txc->modes & ADJ_STATUS) process_adj_status(txc, ts); @@ -613,7 +610,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts } if (txc->modes & ADJ_TAI && txc->constant > 0) - time_tai = txc->constant; + *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) ntp_update_offset(txc->offset); @@ -632,6 +629,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts int do_adjtimex(struct timex *txc) { struct timespec ts; + u32 time_tai, orig_tai; int result; /* Validate the data before disabling interrupts */ @@ -671,6 +669,7 @@ int do_adjtimex(struct timex *txc) } getnstimeofday(&ts); + orig_tai = time_tai = timekeeping_get_tai_offset(); raw_spin_lock_irq(&ntp_lock); @@ -687,7 +686,7 @@ int do_adjtimex(struct timex *txc) /* If there are input parameters, then process them: */ if (txc->modes) - process_adjtimex_modes(txc, &ts); + process_adjtimex_modes(txc, &ts, &time_tai); txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); @@ -716,6 +715,9 @@ int do_adjtimex(struct timex *txc) raw_spin_unlock_irq(&ntp_lock); + if (time_tai != orig_tai) + timekeeping_set_tai_offset(time_tai); + txc->time.tv_sec = ts.tv_sec; txc->time.tv_usec = ts.tv_nsec; if (!(time_status & STA_NANO)) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0355f12..937098a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -513,6 +513,48 @@ error: /* even if we error out, we forwarded the time, so call update */ } EXPORT_SYMBOL(timekeeping_inject_offset); + +/** + * timekeeping_get_tai_offset - Returns current TAI offset from UTC + * + */ +s32 timekeeping_get_tai_offset(void) +{ + struct timekeeper *tk = &timekeeper; + unsigned int seq; + s32 ret; + + do { + seq = read_seqbegin(&tk->lock); + ret = tk->tai_offset; + } while (read_seqretry(&tk->lock, seq)); + + return ret; +} + +/** + * __timekeeping_set_tai_offset - Lock free worker function + * + */ +void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) +{ + tk->tai_offset = tai_offset; +} + +/** + * timekeeping_set_tai_offset - Sets the current TAI offset from UTC + * + */ +void timekeeping_set_tai_offset(s32 tai_offset) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + + write_seqlock_irqsave(&tk->lock, flags); + __timekeeping_set_tai_offset(tk, tai_offset); + write_sequnlock_irqrestore(&tk->lock, flags); +} + /** * change_clocksource - Swaps clocksources if a new one is available * @@ -1143,6 +1185,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts)); + __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); + clock_was_set_delayed(); } } -- cgit v1.1 From 1ff3c9677bff7e468e0c487d0ffefe4e901d33f4 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 3 May 2012 12:43:40 -0700 Subject: timekeeping: Add CLOCK_TAI clockid This add a CLOCK_TAI clockid and the needed accessors. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- kernel/posix-timers.c | 10 ++++++++++ kernel/time/timekeeping.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 6edbb2c..fbfc5f1 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -221,6 +221,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) return 0; } +static int posix_get_tai(clockid_t which_clock, struct timespec *tp) +{ + timekeeping_clocktai(tp); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) @@ -261,6 +266,10 @@ static __init int init_posix_timers(void) .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, }; + struct k_clock clock_tai = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_tai, + }; struct k_clock clock_boottime = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_boottime, @@ -278,6 +287,7 @@ static __init int init_posix_timers(void) posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); + posix_timers_register_clock(CLOCK_TAI, &clock_tai); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 937098a..8a84275 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -379,6 +379,36 @@ void ktime_get_ts(struct timespec *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts); + +/** + * timekeeping_clocktai - Returns the TAI time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void timekeeping_clocktai(struct timespec *ts) +{ + struct timekeeper *tk = &timekeeper; + unsigned long seq; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&tk->lock); + + ts->tv_sec = tk->xtime_sec + tk->tai_offset; + nsecs = timekeeping_get_ns(tk); + + } while (read_seqretry(&tk->lock, seq)); + + ts->tv_nsec = 0; + timespec_add_ns(ts, nsecs); + +} +EXPORT_SYMBOL(timekeeping_clocktai); + + #ifdef CONFIG_NTP_PPS /** -- cgit v1.1 From 90adda98b89aaf68b06014ecf805b6c477daa19b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 21 Jan 2013 17:00:11 -0800 Subject: hrtimer: Add hrtimer support for CLOCK_TAI Add hrtimer support for CLOCK_TAI, as well as posix timer interfaces. Signed-off-by: John Stultz --- kernel/hrtimer.c | 14 +++++++++++++- kernel/posix-timers.c | 6 ++++++ kernel/time/timekeeping.c | 20 +++++++++++++++++++- 3 files changed, 38 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812..2587207 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -83,6 +83,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_boottime, .resolution = KTIME_LOW_RES, }, + { + .index = HRTIMER_BASE_TAI, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + .resolution = KTIME_LOW_RES, + }, } }; @@ -90,6 +96,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_TAI] = HRTIMER_BASE_TAI, }; static inline int hrtimer_clockid_to_base(clockid_t clock_id) @@ -106,8 +113,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, mono, boot; struct timespec xts, tom, slp; + s32 tai_offset; get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); + tai_offset = timekeeping_get_tai_offset(); xtim = timespec_to_ktime(xts); mono = ktime_add(xtim, timespec_to_ktime(tom)); @@ -115,6 +124,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; + base->clock_base[HRTIMER_BASE_TAI].softirq_time = + ktime_add(xtim, ktime_set(tai_offset, 0)); } /* @@ -651,8 +662,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets(offs_real, offs_boot); + return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); } /* diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index fbfc5f1..2a2e173 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -269,6 +269,12 @@ static __init int init_posix_timers(void) struct k_clock clock_tai = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_boottime = { .clock_getres = hrtimer_get_res, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8a84275..8061ae0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -67,6 +67,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) tk->wall_to_monotonic = wtm; set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); tk->offs_real = timespec_to_ktime(tmp); + tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); } static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) @@ -409,6 +410,20 @@ void timekeeping_clocktai(struct timespec *ts) EXPORT_SYMBOL(timekeeping_clocktai); +/** + * ktime_get_clocktai - Returns the TAI time of day in a ktime + * + * Returns the time of day in a ktime. + */ +ktime_t ktime_get_clocktai(void) +{ + struct timespec ts; + + timekeeping_clocktai(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL(ktime_get_clocktai); + #ifdef CONFIG_NTP_PPS /** @@ -569,6 +584,7 @@ s32 timekeeping_get_tai_offset(void) void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; + tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); } /** @@ -1539,7 +1555,8 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, * Returns current monotonic time and updates the offsets * Called from hrtimer_interupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai) { struct timekeeper *tk = &timekeeper; ktime_t now; @@ -1554,6 +1571,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; } while (read_seqretry(&tk->lock, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); -- cgit v1.1 From 23a9537a6999fce16f06ca61fc6cac52c8fbdc86 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:36 +0000 Subject: timekeeping: Calc stuff once Calculate the cycle interval shifted value once. No functional change, just makes the code more readable. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8061ae0..c442a4c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1250,15 +1250,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, u32 shift) { + cycle_t interval = tk->cycle_interval << shift; u64 raw_nsecs; /* If the offset is smaller then a shifted interval, do nothing */ - if (offset < tk->cycle_interval<cycle_interval << shift; - tk->clock->cycle_last += tk->cycle_interval << shift; + offset -= interval; + tk->clock->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; accumulate_nsecs_to_secs(tk); -- cgit v1.1 From eb93e4d93093615c60cb7dd3dcb24e46bd7d62d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:36 +0000 Subject: timekeeping: Make jiffies_lock internal Nothing outside of the timekeeping core needs that lock. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/tick-internal.h | 2 ++ kernel/time/timekeeping.c | 1 + 2 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cf3e59e..f5c9207 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include #include +extern seqlock_t jiffies_lock; + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c442a4c..b0c648f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -23,6 +23,7 @@ #include #include +#include "tick-internal.h" static struct timekeeper timekeeper; -- cgit v1.1 From 7e40672d930b369c1984457233ec5557aa53bfb8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:37 +0000 Subject: timekeeping: Move lock out of timekeeper struct Make the lock a separate entity. Preparatory patch for shadow timekeeper structure. Signed-off-by: Thomas Gleixner [Merged with CLOCK_TAI changes] Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 108 +++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b0c648f..caede71 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -26,6 +26,7 @@ #include "tick-internal.h" static struct timekeeper timekeeper; +static DEFINE_SEQLOCK(timekeeper_lock); /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -212,11 +213,11 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); /* update timekeeping data */ update_pvclock_gtod(tk); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -230,13 +231,12 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { - struct timekeeper *tk = &timekeeper; unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -296,12 +296,12 @@ int __getnstimeofday(struct timespec *ts) s64 nsecs = 0; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -337,11 +337,11 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. @@ -368,12 +368,12 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; @@ -397,12 +397,12 @@ void timekeeping_clocktai(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec + tk->tai_offset; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -445,7 +445,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; @@ -454,7 +454,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(tk); nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -494,7 +494,7 @@ int do_settimeofday(const struct timespec *tv) if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -508,7 +508,7 @@ int do_settimeofday(const struct timespec *tv) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -533,7 +533,7 @@ int timekeeping_inject_offset(struct timespec *ts) if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -550,7 +550,7 @@ int timekeeping_inject_offset(struct timespec *ts) error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -571,9 +571,9 @@ s32 timekeeping_get_tai_offset(void) s32 ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ret = tk->tai_offset; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return ret; } @@ -597,9 +597,9 @@ void timekeeping_set_tai_offset(s32 tai_offset) struct timekeeper *tk = &timekeeper; unsigned long flags; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); __timekeeping_set_tai_offset(tk, tai_offset); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); } /** @@ -615,7 +615,7 @@ static int change_clocksource(void *data) new = (struct clocksource *) data; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { @@ -626,7 +626,7 @@ static int change_clocksource(void *data) } timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); return 0; } @@ -676,11 +676,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); nsecs = timekeeping_get_ns_raw(tk); *ts = tk->raw_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); timespec_add_ns(ts, nsecs); } @@ -696,11 +696,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return ret; } @@ -715,11 +715,11 @@ u64 timekeeping_max_deferment(void) u64 ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ret = tk->clock->max_idle_ns; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return ret; } @@ -782,11 +782,9 @@ void __init timekeeping_init(void) boot.tv_nsec = 0; } - seqlock_init(&tk->lock); - ntp_init(); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -805,7 +803,7 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ @@ -853,7 +851,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (has_persistent_clock()) return; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -861,7 +859,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -888,7 +886,7 @@ static void timekeeping_resume(void) clockevents_resume(); clocksource_resume(); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); /* * After system resumes, we need to calculate the suspended time and @@ -940,7 +938,7 @@ static void timekeeping_resume(void) tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, false); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -959,7 +957,7 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -982,7 +980,7 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -1322,7 +1320,7 @@ static void update_wall_time(void) int shift = 0, maxshift; unsigned long flags; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -1377,7 +1375,7 @@ static void update_wall_time(void) timekeeping_update(tk, false); out: - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); } @@ -1425,13 +1423,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_sec += tomono.tv_sec + sleep.tv_sec; ts->tv_nsec = 0; @@ -1490,10 +1488,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); now = tk_xtime(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return now; } @@ -1506,11 +1504,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1541,11 +1539,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); *xtim = tk_xtime(tk); *wtom = tk->wall_to_monotonic; *sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1566,7 +1564,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, u64 secs, nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); secs = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); @@ -1574,7 +1572,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1592,9 +1590,9 @@ ktime_t ktime_get_monotonic_offset(void) struct timespec wtom; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); wtom = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return timespec_to_ktime(wtom); } -- cgit v1.1 From 9a7a71b1d0968fc2bd602b7481cde1d4872e01ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:38 +0000 Subject: timekeeping: Split timekeeper_lock into lock and seqcount We want to shorten the seqcount write hold time. So split the seqlock into a lock and a seqcount. Open code the seqwrite_lock in the places which matter and drop the sequence counter update where it's pointless. Signed-off-by: Thomas Gleixner [jstultz: Merge fixups from CLOCK_TAI collisions] Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 132 +++++++++++++++++++++++++--------------------- 1 file changed, 73 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index caede71..5e048e0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -26,7 +26,8 @@ #include "tick-internal.h" static struct timekeeper timekeeper; -static DEFINE_SEQLOCK(timekeeper_lock); +static DEFINE_RAW_SPINLOCK(timekeeper_lock); +static seqcount_t timekeeper_seq; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -204,8 +205,6 @@ static void update_pvclock_gtod(struct timekeeper *tk) /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { @@ -213,11 +212,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); - /* update timekeeping data */ update_pvclock_gtod(tk); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -226,23 +224,21 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { unsigned long flags; int ret; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); -/* must hold write on timekeeper.lock */ +/* must hold timekeeper_lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { if (clearntp) { @@ -296,12 +292,12 @@ int __getnstimeofday(struct timespec *ts) s64 nsecs = 0; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -337,11 +333,11 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. @@ -368,12 +364,12 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; @@ -397,12 +393,12 @@ void timekeeping_clocktai(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec + tk->tai_offset; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -445,7 +441,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; @@ -454,7 +450,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(tk); nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -494,7 +490,8 @@ int do_settimeofday(const struct timespec *tv) if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -508,7 +505,8 @@ int do_settimeofday(const struct timespec *tv) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -533,7 +531,8 @@ int timekeeping_inject_offset(struct timespec *ts) if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -550,7 +549,8 @@ int timekeeping_inject_offset(struct timespec *ts) error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -571,9 +571,9 @@ s32 timekeeping_get_tai_offset(void) s32 ret; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->tai_offset; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -597,9 +597,11 @@ void timekeeping_set_tai_offset(s32 tai_offset) struct timekeeper *tk = &timekeeper; unsigned long flags; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); __timekeeping_set_tai_offset(tk, tai_offset); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /** @@ -615,7 +617,8 @@ static int change_clocksource(void *data) new = (struct clocksource *) data; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { @@ -626,7 +629,8 @@ static int change_clocksource(void *data) } timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return 0; } @@ -676,11 +680,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); nsecs = timekeeping_get_ns_raw(tk); *ts = tk->raw_time; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts, nsecs); } @@ -696,11 +700,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -715,11 +719,11 @@ u64 timekeeping_max_deferment(void) u64 ret; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->max_idle_ns; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -784,7 +788,8 @@ void __init timekeeping_init(void) ntp_init(); - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -803,7 +808,8 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ @@ -851,7 +857,8 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (has_persistent_clock()) return; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -859,7 +866,8 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -886,7 +894,8 @@ static void timekeeping_resume(void) clockevents_resume(); clocksource_resume(); - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); /* * After system resumes, we need to calculate the suspended time and @@ -938,7 +947,8 @@ static void timekeeping_resume(void) tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, false); - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -957,7 +967,8 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -980,7 +991,8 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -1320,7 +1332,8 @@ static void update_wall_time(void) int shift = 0, maxshift; unsigned long flags; - write_seqlock_irqsave(&timekeeper_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -1375,7 +1388,8 @@ static void update_wall_time(void) timekeeping_update(tk, false); out: - write_sequnlock_irqrestore(&timekeeper_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } @@ -1423,13 +1437,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec + sleep.tv_sec; ts->tv_nsec = 0; @@ -1488,10 +1502,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return now; } @@ -1504,11 +1518,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1539,11 +1553,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); *xtim = tk_xtime(tk); *wtom = tk->wall_to_monotonic; *sleep = tk->total_sleep_time; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1564,7 +1578,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, u64 secs, nsecs; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); @@ -1572,7 +1586,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1590,9 +1604,9 @@ ktime_t ktime_get_monotonic_offset(void) struct timespec wtom; do { - seq = read_seqbegin(&timekeeper_lock); + seq = read_seqcount_begin(&timekeeper_seq); wtom = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return timespec_to_ktime(wtom); } -- cgit v1.1 From 8ffbc7d9ff0064ce000788272a609fa79d06ead1 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 13 Mar 2013 12:20:38 -0700 Subject: hrtimer/trivial: Fix comment text in hrtimer.c The comments mention HRTIMER_ABS and HRTIMER_REL, these symbols don't exist, the proper names are HRTIMER_MODE_ABS and HRTIMER_MODE_REL. Signed-off-by: David Daney Cc: Jiri Kosina Link: http://lkml.kernel.org/r/1363202438-21234-1-git-send-email-ddaney.cavm@gmail.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812..031184e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1010,7 +1010,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) * * Returns: * 0 on success @@ -1027,7 +1028,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); * hrtimer_start - (re)start an hrtimer on the current CPU * @timer: the timer to be added * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) * * Returns: * 0 on success -- cgit v1.1 From cfea7d7e452f57682a0bb55a55e9f79c569558c2 Mon Sep 17 00:00:00 2001 From: Rado Vrbovsky Date: Fri, 8 Feb 2013 12:37:30 -0500 Subject: tick: Change log level of NOHZ: local_softirq_pending message The "NOHZ: local_softirq_pending" message is a largely informational message. This makes extra work for customers that have a policy of investigating all kernel log messages logged at <= KERN_ERR log level. This patch sets the message to a different log level. [ tglx: Use pr_warn() ] Signed-off-by: Rado Vrbovsky Cc: Don Zickus Link: http://lkml.kernel.org/r/2037057938.893524.1360345050772.JavaMail.root@redhat.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a19a399..225f8bf 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -482,8 +482,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (ratelimit < 10 && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); + pr_warn("NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); ratelimit++; } return false; -- cgit v1.1 From dd5d70e869f960bde6376f4447fff59f16186cf5 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Mon, 25 Mar 2013 12:24:24 -0700 Subject: timekeeping: __timekeeping_set_tai_offset can be static Yet again, the kbuild test robot saves the day, noting I left out defining __timekeeping_set_tai_offset as static. It even sent me this patch. Reported-by: Fengguang Wu Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5e048e0..c5feb7a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -582,7 +582,7 @@ s32 timekeeping_get_tai_offset(void) * __timekeeping_set_tai_offset - Lock free worker function * */ -void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) +static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); -- cgit v1.1 From ad460967a2953496ad76b1c22091ea99f21b4e86 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 11:59:04 -0700 Subject: ntp: Split out timex validation from do_adjtimex Split out the timex validation done in do_adjtimex into a separate function. This will help simplify logic in following patches. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/ntp.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 59e2749..457d2ba 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -622,17 +622,13 @@ static inline void process_adjtimex_modes(struct timex *txc, ntp_update_frequency(); } -/* - * adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. + + +/** + * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -int do_adjtimex(struct timex *txc) +int ntp_validate_timex(struct timex *txc) { - struct timespec ts; - u32 time_tai, orig_tai; - int result; - - /* Validate the data before disabling interrupts */ if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) @@ -644,7 +640,6 @@ int do_adjtimex(struct timex *txc) /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; - /* * if the quartz is off by more than 10% then * something is VERY wrong! @@ -655,12 +650,32 @@ int do_adjtimex(struct timex *txc) return -EINVAL; } + if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) + return -EPERM; + + return 0; +} + + +/* + * adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ +int do_adjtimex(struct timex *txc) +{ + struct timespec ts; + u32 time_tai, orig_tai; + int result; + + /* Validate the data before disabling interrupts */ + result = ntp_validate_timex(txc); + if (result) + return result; + if (txc->modes & ADJ_SETOFFSET) { struct timespec delta; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; - if (!capable(CAP_SYS_TIME)) - return -EPERM; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; result = timekeeping_inject_offset(&delta); -- cgit v1.1 From aa6f9c595d857328e5d815e5b94c0e7cd31a6b59 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 11:31:29 -0700 Subject: ntp: Move do_adjtimex() and hardpps() functions to timekeeping.c In preparation for changing the ntp locking rules, move do_adjtimex and hardpps accessor functions to timekeeping.c, but keep the code logic in ntp.c. This patch also introduces a ntp_internal.h file so timekeeping specific interfaces of ntp.c can be more limitedly shared with timekeeping.c. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/ntp.c | 9 ++++----- kernel/time/ntp_internal.h | 11 +++++++++++ kernel/time/timekeeping.c | 21 +++++++++++++++++++++ 3 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 kernel/time/ntp_internal.h (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 457d2ba..8b10706 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -18,6 +18,7 @@ #include #include "tick-internal.h" +#include "ntp_internal.h" /* * NTP timekeeping variables: @@ -661,7 +662,7 @@ int ntp_validate_timex(struct timex *txc) * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int do_adjtimex(struct timex *txc) +int __do_adjtimex(struct timex *txc) { struct timespec ts; u32 time_tai, orig_tai; @@ -911,7 +912,7 @@ static void hardpps_update_phase(long error) } /* - * hardpps() - discipline CPU clock oscillator to external PPS signal + * __hardpps() - discipline CPU clock oscillator to external PPS signal * * This routine is called at each PPS signal arrival in order to * discipline the CPU clock oscillator to the PPS signal. It takes two @@ -922,7 +923,7 @@ static void hardpps_update_phase(long error) * This code is based on David Mills's reference nanokernel * implementation. It was mostly rewritten but keeps the same idea. */ -void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) { struct pps_normtime pts_norm, freq_norm; unsigned long flags; @@ -976,8 +977,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) raw_spin_unlock_irqrestore(&ntp_lock, flags); } -EXPORT_SYMBOL(hardpps); - #endif /* CONFIG_NTP_PPS */ static int __init ntp_tick_adj_setup(char *str) diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h new file mode 100644 index 0000000..fdee80c --- /dev/null +++ b/kernel/time/ntp_internal.h @@ -0,0 +1,11 @@ +#ifndef _LINUX_NTP_INTERNAL_H +#define _LINUX_NTP_INTERNAL_H + +extern void ntp_init(void); +extern void ntp_clear(void); +/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ +extern u64 ntp_tick_length(void); +extern int second_overflow(unsigned long secs); +extern int __do_adjtimex(struct timex *); +extern void __hardpps(const struct timespec *, const struct timespec *); +#endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c5feb7a..a138ec2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -24,6 +24,7 @@ #include #include "tick-internal.h" +#include "ntp_internal.h" static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); @@ -1613,6 +1614,26 @@ ktime_t ktime_get_monotonic_offset(void) EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); /** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + */ +int do_adjtimex(struct timex *txc) +{ + return __do_adjtimex(txc); +} + + +#ifdef CONFIG_NTP_PPS +/** + * hardpps() - Accessor function to NTP __hardpps function + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ + __hardpps(phase_ts, raw_ts); +} +EXPORT_SYMBOL(hardpps); +#endif + +/** * xtime_update() - advances the timekeeping infrastructure * @ticks: number of ticks, that have elapsed since the last call. * -- cgit v1.1 From e4085693f629ded8ac8c35b5cdd324d20242990b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 12:08:52 -0700 Subject: ntp: Move timex validation to timekeeping do_adjtimex call. Move logic that does not need the ntp state to be done in the timekeeping do_adjtimex() call. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/ntp.c | 5 ----- kernel/time/ntp_internal.h | 1 + kernel/time/timekeeping.c | 7 +++++++ 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8b10706..2dc60c6 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -668,11 +668,6 @@ int __do_adjtimex(struct timex *txc) u32 time_tai, orig_tai; int result; - /* Validate the data before disabling interrupts */ - result = ntp_validate_timex(txc); - if (result) - return result; - if (txc->modes & ADJ_SETOFFSET) { struct timespec delta; delta.tv_sec = txc->time.tv_sec; diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index fdee80c..a2a3976 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -6,6 +6,7 @@ extern void ntp_clear(void); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ extern u64 ntp_tick_length(void); extern int second_overflow(unsigned long secs); +extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *); extern void __hardpps(const struct timespec *, const struct timespec *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a138ec2..f6c8a72 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1618,6 +1618,13 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ int do_adjtimex(struct timex *txc) { + int ret; + + /* Validate the data before disabling interrupts */ + ret = ntp_validate_timex(txc); + if (ret) + return ret; + return __do_adjtimex(txc); } -- cgit v1.1 From 87ace39b7168bd9d352c1c52b6f5d88eb1876cf8 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 12:28:15 -0700 Subject: ntp: Rework do_adjtimex to take timespec and tai arguments In order to change the locking rules, we need to provide the timespec and tai values rather then having the ntp logic acquire these values itself. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/ntp.c | 18 +++++------------- kernel/time/ntp_internal.h | 2 +- kernel/time/timekeeping.c | 13 +++++++++++-- 3 files changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 2dc60c6..d17e13c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -662,10 +662,8 @@ int ntp_validate_timex(struct timex *txc) * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int __do_adjtimex(struct timex *txc) +int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) { - struct timespec ts; - u32 time_tai, orig_tai; int result; if (txc->modes & ADJ_SETOFFSET) { @@ -679,9 +677,6 @@ int __do_adjtimex(struct timex *txc) return result; } - getnstimeofday(&ts); - orig_tai = time_tai = timekeeping_get_tai_offset(); - raw_spin_lock_irq(&ntp_lock); if (txc->modes & ADJ_ADJTIME) { @@ -697,7 +692,7 @@ int __do_adjtimex(struct timex *txc) /* If there are input parameters, then process them: */ if (txc->modes) - process_adjtimex_modes(txc, &ts, &time_tai); + process_adjtimex_modes(txc, ts, time_tai); txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); @@ -719,18 +714,15 @@ int __do_adjtimex(struct timex *txc) txc->precision = 1; txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; txc->tick = tick_usec; - txc->tai = time_tai; + txc->tai = *time_tai; /* fill PPS status fields */ pps_fill_timex(txc); raw_spin_unlock_irq(&ntp_lock); - if (time_tai != orig_tai) - timekeeping_set_tai_offset(time_tai); - - txc->time.tv_sec = ts.tv_sec; - txc->time.tv_usec = ts.tv_nsec; + txc->time.tv_sec = ts->tv_sec; + txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index a2a3976..1950cb4 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -7,6 +7,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern int second_overflow(unsigned long secs); extern int ntp_validate_timex(struct timex *); -extern int __do_adjtimex(struct timex *); +extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); extern void __hardpps(const struct timespec *, const struct timespec *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f6c8a72..5f7a233 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1618,6 +1618,8 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ int do_adjtimex(struct timex *txc) { + struct timespec ts; + s32 tai, orig_tai; int ret; /* Validate the data before disabling interrupts */ @@ -1625,9 +1627,16 @@ int do_adjtimex(struct timex *txc) if (ret) return ret; - return __do_adjtimex(txc); -} + getnstimeofday(&ts); + orig_tai = tai = timekeeping_get_tai_offset(); + + ret = __do_adjtimex(txc, &ts, &tai); + if (tai != orig_tai) + timekeeping_set_tai_offset(tai); + + return ret; +} #ifdef CONFIG_NTP_PPS /** -- cgit v1.1 From cef90377fab488bd1f959efda178fb83250cf61d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 15:04:13 -0700 Subject: timekeeping: Move ADJ_SETOFFSET to top level do_adjtimex() Since ADJ_SETOFFSET adjusts the timekeeping state, process it as part of the top level do_adjtimex() function in timekeeping.c. This avoids deadlocks that could occur once we change the ntp locking rules. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/ntp.c | 11 ----------- kernel/time/timekeeping.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d17e13c..a331ebc 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -666,17 +666,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) { int result; - if (txc->modes & ADJ_SETOFFSET) { - struct timespec delta; - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; - if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - result = timekeeping_inject_offset(&delta); - if (result) - return result; - } - raw_spin_lock_irq(&ntp_lock); if (txc->modes & ADJ_ADJTIME) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5f7a233..e44915c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1627,6 +1627,17 @@ int do_adjtimex(struct timex *txc) if (ret) return ret; + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + ret = timekeeping_inject_offset(&delta); + if (ret) + return ret; + } + getnstimeofday(&ts); orig_tai = tai = timekeeping_get_tai_offset(); -- cgit v1.1 From 06c017fdd4dc48451a29ac37fc1db4a3f86b7f40 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 11:37:28 -0700 Subject: timekeeping: Hold timekeepering locks in do_adjtimex and hardpps In moving the NTP state to be protected by the timekeeping locks, be sure to acquire the timekeeping locks prior to calling ntp functions. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e44915c..d10bd73 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -787,10 +787,10 @@ void __init timekeeping_init(void) boot.tv_nsec = 0; } - ntp_init(); - raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); + ntp_init(); + clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -1618,6 +1618,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ int do_adjtimex(struct timex *txc) { + unsigned long flags; struct timespec ts; s32 tai, orig_tai; int ret; @@ -1641,8 +1642,14 @@ int do_adjtimex(struct timex *txc) getnstimeofday(&ts); orig_tai = tai = timekeeping_get_tai_offset(); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + ret = __do_adjtimex(txc, &ts, &tai); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (tai != orig_tai) timekeeping_set_tai_offset(tai); @@ -1655,7 +1662,15 @@ int do_adjtimex(struct timex *txc) */ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) { + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + __hardpps(phase_ts, raw_ts); + + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); #endif -- cgit v1.1 From 0b5154fb9040cca94e7d9893384c0e78bfe2d296 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 14:20:03 -0700 Subject: timekeeping: Simplify tai updating from do_adjtimex Since we are taking the timekeeping locks, just go ahead and update any tai change directly, rather then dropping the lock and calling a function that will just take it again. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d10bd73..f93f60c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1618,9 +1618,10 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ int do_adjtimex(struct timex *txc) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec ts; - s32 tai, orig_tai; + s32 tai; int ret; /* Validate the data before disabling interrupts */ @@ -1640,19 +1641,17 @@ int do_adjtimex(struct timex *txc) } getnstimeofday(&ts); - orig_tai = tai = timekeeping_get_tai_offset(); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); + tai = tk->tai_offset; ret = __do_adjtimex(txc, &ts, &tai); + __timekeeping_set_tai_offset(tk, tai); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - if (tai != orig_tai) - timekeeping_set_tai_offset(tai); - return ret; } -- cgit v1.1 From a076b2146fabb0894cae5e0189a8ba3f1502d737 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Mar 2013 11:52:03 -0700 Subject: ntp: Remove ntp_lock, using the timekeeping locks to protect ntp state In order to properly handle the NTP state in future changes to the timekeeping lock management, this patch moves the management of all of the ntp state under the timekeeping locks. This allows us to remove the ntp_lock. Cc: Thomas Gleixner Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: John Stultz --- kernel/time/ntp.c | 41 ++++------------------------------------- 1 file changed, 4 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index a331ebc..12ff13a 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -22,10 +22,10 @@ /* * NTP timekeeping variables: + * + * Note: All of the NTP state is protected by the timekeeping locks. */ -DEFINE_RAW_SPINLOCK(ntp_lock); - /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; @@ -132,8 +132,6 @@ static inline void pps_reset_freq_interval(void) /** * pps_clear - Clears the PPS state variables - * - * Must be called while holding a write on the ntp_lock */ static inline void pps_clear(void) { @@ -148,8 +146,6 @@ static inline void pps_clear(void) /* Decrease pps_valid to indicate that another second has passed since * the last PPS signal. When it reaches 0, indicate that PPS signal is * missing. - * - * Must be called while holding a write on the ntp_lock */ static inline void pps_dec_valid(void) { @@ -344,10 +340,6 @@ static void ntp_update_offset(long offset) */ void ntp_clear(void) { - unsigned long flags; - - raw_spin_lock_irqsave(&ntp_lock, flags); - time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; @@ -360,20 +352,12 @@ void ntp_clear(void) /* Clear PPS state variables */ pps_clear(); - raw_spin_unlock_irqrestore(&ntp_lock, flags); - } u64 ntp_tick_length(void) { - unsigned long flags; - s64 ret; - - raw_spin_lock_irqsave(&ntp_lock, flags); - ret = tick_length; - raw_spin_unlock_irqrestore(&ntp_lock, flags); - return ret; + return tick_length; } @@ -391,9 +375,6 @@ int second_overflow(unsigned long secs) { s64 delta; int leap = 0; - unsigned long flags; - - raw_spin_lock_irqsave(&ntp_lock, flags); /* * Leap second processing. If in leap-insert state at the end of the @@ -475,8 +456,6 @@ int second_overflow(unsigned long secs) time_adjust = 0; out: - raw_spin_unlock_irqrestore(&ntp_lock, flags); - return leap; } @@ -571,10 +550,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) time_status |= txc->status & ~STA_RONLY; } -/* - * Called with ntp_lock held, so we can access and modify - * all the global NTP state: - */ + static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts, s32 *time_tai) @@ -666,8 +642,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) { int result; - raw_spin_lock_irq(&ntp_lock); - if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -708,8 +682,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) /* fill PPS status fields */ pps_fill_timex(txc); - raw_spin_unlock_irq(&ntp_lock); - txc->time.tv_sec = ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) @@ -906,8 +878,6 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) pts_norm = pps_normalize_ts(*phase_ts); - raw_spin_lock_irqsave(&ntp_lock, flags); - /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -919,7 +889,6 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; - raw_spin_unlock_irqrestore(&ntp_lock, flags); return; } @@ -934,7 +903,6 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - raw_spin_unlock_irqrestore(&ntp_lock, flags); pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } @@ -951,7 +919,6 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) hardpps_update_phase(pts_norm.nsec); - raw_spin_unlock_irqrestore(&ntp_lock, flags); } #endif /* CONFIG_NTP_PPS */ -- cgit v1.1 From 14a3b6abe98c8f53a13522610c257accef7321df Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:38 +0000 Subject: timekeeping: Store cycle_last value in timekeeper struct as well For implementing a shadow timekeeper and a split calculation/update region we need to store the cycle_last value in the timekeeper and update the value in the clocksource struct only in the update region. Add the extra storage to the timekeeper. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f93f60c..4c276b2d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -101,7 +101,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) old_clock = tk->clock; tk->clock = clock; - clock->cycle_last = clock->read(clock); + tk->cycle_last = clock->cycle_last = clock->read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -266,7 +266,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) clock = tk->clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - clock->cycle_last = cycle_now; + tk->cycle_last = clock->cycle_last = cycle_now; tk->xtime_nsec += cycle_delta * tk->mult; -- cgit v1.1 From 7ec98e15aa049b7a2ca73485f31cf4f90c34e2dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:39 +0000 Subject: timekeeping: Delay update of clock->cycle_last For calculating the new timekeeper values store the new cycle_last value in the timekeeper and update the clock->cycle_last just when we actually update the new values. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4c276b2d..38ac782 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1271,7 +1271,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, /* Accumulate one shifted interval */ offset -= interval; - tk->clock->cycle_last += interval; + tk->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; accumulate_nsecs_to_secs(tk); @@ -1386,6 +1386,8 @@ static void update_wall_time(void) */ accumulate_nsecs_to_secs(tk); + /* Update clock->cycle_last with the new value */ + clock->cycle_last = tk->cycle_last; timekeeping_update(tk, false); out: -- cgit v1.1 From 48cdc135d4840aab8efd2fc3bacb5d7dfd94a9c8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:40 +0000 Subject: timekeeping: Implement a shadow timekeeper Use the shadow timekeeper to do the update_wall_time() adjustments and then copy it over to the real timekeeper. Keep the shadow timekeeper in sync when updating stuff outside of update_wall_time(). This allows us to limit the timekeeper_seq hold time to the update of the real timekeeper and the vsyscall data in the next patch. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 38ac782..d20ffda 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -29,6 +29,7 @@ static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); static seqcount_t timekeeper_seq; +static struct timekeeper shadow_timekeeper; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -240,7 +241,7 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* must hold timekeeper_lock */ -static void timekeeping_update(struct timekeeper *tk, bool clearntp) +static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) { if (clearntp) { tk->ntp_error = 0; @@ -248,6 +249,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) } update_vsyscall(tk); update_pvclock_gtod(tk); + + if (mirror) + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); } /** @@ -504,7 +508,7 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(tk, tv); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -548,7 +552,7 @@ int timekeeping_inject_offset(struct timespec *ts) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -628,7 +632,7 @@ static int change_clocksource(void *data) if (old->disable) old->disable(old); } - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -809,6 +813,8 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } @@ -865,7 +871,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -947,7 +953,7 @@ static void timekeeping_resume(void) clock->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, false); + timekeeping_update(tk, false, true); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1328,7 +1334,8 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) static void update_wall_time(void) { struct clocksource *clock; - struct timekeeper *tk = &timekeeper; + struct timekeeper *real_tk = &timekeeper; + struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; unsigned long flags; @@ -1340,16 +1347,16 @@ static void update_wall_time(void) if (unlikely(timekeeping_suspended)) goto out; - clock = tk->clock; + clock = real_tk->clock; #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = tk->cycle_interval; + offset = real_tk->cycle_interval; #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif /* Check if there's really nothing to do */ - if (offset < tk->cycle_interval) + if (offset < real_tk->cycle_interval) goto out; /* @@ -1388,12 +1395,22 @@ static void update_wall_time(void) /* Update clock->cycle_last with the new value */ clock->cycle_last = tk->cycle_last; - timekeeping_update(tk, false); + /* + * Update the real timekeeper. + * + * We could avoid this memcpy by switching pointers, but that + * requires changes to all other timekeeper usage sites as + * well, i.e. move the timekeeper pointer getter into the + * spinlocked/seqcount protected sections. And we trade this + * memcpy under the timekeeper_seq against one before we start + * updating. + */ + memcpy(real_tk, tk, sizeof(*tk)); + timekeeping_update(real_tk, false, false); out: write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - } /** -- cgit v1.1 From ca4523cda429712fc135c5db50920d90eb776a6c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:40 +0000 Subject: timekeeping: Shorten seq_count region Shorten the seqcount write hold region to the actual update of the timekeeper and the related data (e.g vsyscall). On a contemporary x86 system this reduces the maximum latencies on Preempt-RT from 8us to 4us on the non-timekeeping cores. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d20ffda..c4d2a87 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1341,7 +1341,6 @@ static void update_wall_time(void) unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -1393,6 +1392,7 @@ static void update_wall_time(void) */ accumulate_nsecs_to_secs(tk); + write_seqcount_begin(&timekeeper_seq); /* Update clock->cycle_last with the new value */ clock->cycle_last = tk->cycle_last; /* @@ -1407,9 +1407,8 @@ static void update_wall_time(void) */ memcpy(real_tk, tk, sizeof(*tk)); timekeeping_update(real_tk, false, false); - -out: write_seqcount_end(&timekeeper_seq); +out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } -- cgit v1.1 From 8f294b5a139ee4b75e890ad5b443c93d1e558a8b Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 8 Apr 2013 08:47:15 -0400 Subject: hrtimer: Add expiry time overflow check in hrtimer_interrupt The settimeofday01 test in the LTP testsuite effectively does gettimeofday(current time); settimeofday(Jan 1, 1970 + 100 seconds); settimeofday(current time); This test causes a stack trace to be displayed on the console during the setting of timeofday to Jan 1, 1970 + 100 seconds: [ 131.066751] ------------[ cut here ]------------ [ 131.096448] WARNING: at kernel/time/clockevents.c:209 clockevents_program_event+0x135/0x140() [ 131.104935] Hardware name: Dinar [ 131.108150] Modules linked in: sg nfsv3 nfs_acl nfsv4 auth_rpcgss nfs dns_resolver fscache lockd sunrpc nf_conntrack_netbios_ns nf_conntrack_broadcast ipt_MASQUERADE ip6table_mangle ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 iptable_nat nf_nat_ipv4 nf_nat iptable_mangle ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter ip_tables kvm_amd kvm sp5100_tco bnx2 i2c_piix4 crc32c_intel k10temp fam15h_power ghash_clmulni_intel amd64_edac_mod pcspkr serio_raw edac_mce_amd edac_core microcode xfs libcrc32c sr_mod sd_mod cdrom ata_generic crc_t10dif pata_acpi radeon i2c_algo_bit drm_kms_helper ttm drm ahci pata_atiixp libahci libata usb_storage i2c_core dm_mirror dm_region_hash dm_log dm_mod [ 131.176784] Pid: 0, comm: swapper/28 Not tainted 3.8.0+ #6 [ 131.182248] Call Trace: [ 131.184684] [] warn_slowpath_common+0x7f/0xc0 [ 131.191312] [] warn_slowpath_null+0x1a/0x20 [ 131.197131] [] clockevents_program_event+0x135/0x140 [ 131.203721] [] tick_program_event+0x24/0x30 [ 131.209534] [] hrtimer_interrupt+0x131/0x230 [ 131.215437] [] ? cpufreq_p4_target+0x130/0x130 [ 131.221509] [] smp_apic_timer_interrupt+0x69/0x99 [ 131.227839] [] apic_timer_interrupt+0x6d/0x80 [ 131.233816] [] ? sched_clock_cpu+0xc5/0x120 [ 131.240267] [] ? cpuidle_wrap_enter+0x50/0xa0 [ 131.246252] [] ? cpuidle_wrap_enter+0x49/0xa0 [ 131.252238] [] cpuidle_enter_tk+0x10/0x20 [ 131.257877] [] cpuidle_idle_call+0xa9/0x260 [ 131.263692] [] cpu_idle+0xaf/0x120 [ 131.268727] [] start_secondary+0x255/0x257 [ 131.274449] ---[ end trace 1151a50552231615 ]--- When we change the system time to a low value like this, the value of timekeeper->offs_real will be a negative value. It seems that the WARN occurs because an hrtimer has been started in the time between the releasing of the timekeeper lock and the IPI call (via a call to on_each_cpu) in clock_was_set() in the do_settimeofday() code. The end result is that a REALTIME_CLOCK timer has been added with softexpires = expires = KTIME_MAX. The hrtimer_interrupt() fires/is called and the loop at kernel/hrtimer.c:1289 is executed. In this loop the code subtracts the clock base's offset (which was set to timekeeper->offs_real in do_settimeofday()) from the current hrtimer_cpu_base->expiry value (which was KTIME_MAX): KTIME_MAX - (a negative value) = overflow A simple check for an overflow can resolve this problem. Using KTIME_MAX instead of the overflow value will result in the hrtimer function being run, and the reprogramming of the timer after that. Cc: Thomas Gleixner Cc: stable@vger.kernel.org Reviewed-by: Rik van Riel Signed-off-by: Prarit Bhargava [jstultz: Tweaked commit subject] Signed-off-by: John Stultz --- kernel/hrtimer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d6830d5..071e093 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1323,6 +1323,8 @@ retry: expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires.tv64 < 0) + expires.tv64 = KTIME_MAX; if (expires.tv64 < expires_next.tv64) expires_next = expires; break; -- cgit v1.1 From 51fd36f3fad8447c487137ae26b9d0b3ce77bb25 Mon Sep 17 00:00:00 2001 From: David Engraf Date: Tue, 19 Mar 2013 13:29:55 +0100 Subject: hrtimer: Fix ktime_add_ns() overflow on 32bit architectures One can trigger an overflow when using ktime_add_ns() on a 32bit architecture not supporting CONFIG_KTIME_SCALAR. When passing a very high value for u64 nsec, e.g. 7881299347898368000 the do_div() function converts this value to seconds (7881299347) which is still to high to pass to the ktime_set() function as long. The result in is a negative value. The problem on my system occurs in the tick-sched.c, tick_nohz_stop_sched_tick() when time_delta is set to timekeeping_max_deferment(). The check for time_delta < KTIME_MAX is valid, thus ktime_add_ns() is called with a too large value resulting in a negative expire value. This leads to an endless loop in the ticker code: time_delta: 7881299347898368000 expires = ktime_add_ns(last_update, time_delta) expires: negative value This fix caps the value to KTIME_MAX. This error doesn't occurs on 64bit or architectures supporting CONFIG_KTIME_SCALAR (e.g. ARM, x86-32). Cc: stable@vger.kernel.org Signed-off-by: David Engraf [jstultz: Minor tweaks to commit message & header] Signed-off-by: John Stultz --- kernel/hrtimer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 071e093..c0875ae 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -286,6 +286,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) } else { unsigned long rem = do_div(nsec, NSEC_PER_SEC); + /* Make sure nsec fits into long */ + if (unlikely(nsec > KTIME_SEC_MAX)) + return (ktime_t){ .tv64 = KTIME_MAX }; + tmp = ktime_set((long)nsec, rem); } -- cgit v1.1 From 4e8f8b34b92b6514cc070aeb94d317cadd5071d7 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Apr 2013 12:41:49 -0700 Subject: timekeeping: Make sure to notify hrtimers when TAI offset changes Now that we have CLOCK_TAI timers, make sure we notify hrtimer code when TAI offset is changed. Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1365622909-953-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c4d2a87..675f720 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -607,6 +607,7 @@ void timekeeping_set_tai_offset(s32 tai_offset) __timekeeping_set_tai_offset(tk, tai_offset); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + clock_was_set(); } /** @@ -1639,7 +1640,7 @@ int do_adjtimex(struct timex *txc) struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec ts; - s32 tai; + s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ @@ -1663,10 +1664,13 @@ int do_adjtimex(struct timex *txc) raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); - tai = tk->tai_offset; + orig_tai = tai = tk->tai_offset; ret = __do_adjtimex(txc, &ts, &tai); - __timekeeping_set_tai_offset(tk, tai); + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tk, tai); + clock_was_set_delayed(); + } write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); -- cgit v1.1 From 5ed67f05f66c41e39880a6d61358438a25f9fee5 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 11 Mar 2013 13:12:21 +0400 Subject: posix timers: Allocate timer id per process (v2) Currently kernel generates IDs for posix timers in a global manner -- there's a kernel-wide IDR tree from which IDs are created. This makes it impossible to recreate a timer with a desired ID (in particular this is done by the CRIU checkpoint-restore project) -- since these IDs are global it may happen, that at the time we recreate a timer, the ID we want for it is already busy by some other timer. In order to address this, replace the IDR tree with a global hash table for timers and makes timer IDs unique per signal_struct (to which timers are linked anyway). With this, two timers belonging to different processes may have equal IDs and we can recreate either of them with the ID we want. Signed-off-by: Pavel Emelyanov Cc: Peter Zijlstra Cc: Michael Kerrisk Cc: Matthew Helsley Link: http://lkml.kernel.org/r/513D9FF5.9010004@parallels.com Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 106 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 69 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 2a2e173..34d7592 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -40,38 +40,31 @@ #include #include #include -#include +#include #include #include #include #include #include #include +#include /* - * Management arrays for POSIX timers. Timers are kept in slab memory - * Timer ids are allocated by an external routine that keeps track of the - * id and the timer. The external interface is: - * - * void *idr_find(struct idr *idp, int id); to find timer_id - * int idr_get_new(struct idr *idp, void *ptr); to get a new id and - * related it to - * void idr_remove(struct idr *idp, int id); to release - * void idr_init(struct idr *idp); to initialize - * which we supply. - * The idr_get_new *may* call slab for more memory so it must not be - * called under a spin lock. Likewise idr_remore may release memory - * (but it may be ok to do this under a lock...). - * idr_find is just a memory look up and is quite fast. A -1 return - * indicates that the requested id does not exist. + * Management arrays for POSIX timers. Timers are now kept in static hash table + * with 512 entries. + * Timer ids are allocated by local routine, which selects proper hash head by + * key, constructed from current->signal address and per signal struct counter. + * This keeps timer ids unique per process, but now they can intersect between + * processes. */ /* * Lets keep our timers in a slab cache :-) */ static struct kmem_cache *posix_timers_cache; -static struct idr posix_timers_id; -static DEFINE_SPINLOCK(idr_lock); + +static DEFINE_HASHTABLE(posix_timers_hashtable, 9); +static DEFINE_SPINLOCK(hash_lock); /* * we assume that the new SIGEV_THREAD_ID shares no bits with the other @@ -152,6 +145,57 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); __timr; \ }) +static int hash(struct signal_struct *sig, unsigned int nr) +{ + return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); +} + +static struct k_itimer *__posix_timers_find(struct hlist_head *head, + struct signal_struct *sig, + timer_t id) +{ + struct hlist_node *node; + struct k_itimer *timer; + + hlist_for_each_entry_rcu(timer, head, t_hash) { + if ((timer->it_signal == sig) && (timer->it_id == id)) + return timer; + } + return NULL; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + + return __posix_timers_find(head, sig, id); +} + +static int posix_timer_add(struct k_itimer *timer) +{ + struct signal_struct *sig = current->signal; + int first_free_id = sig->posix_timer_id; + struct hlist_head *head; + int ret = -ENOENT; + + do { + spin_lock(&hash_lock); + head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; + if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { + hlist_add_head_rcu(&timer->t_hash, head); + ret = sig->posix_timer_id; + } + if (++sig->posix_timer_id < 0) + sig->posix_timer_id = 0; + if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) + /* Loop over all possible ids completed */ + ret = -EAGAIN; + spin_unlock(&hash_lock); + } while (ret == -ENOENT); + return ret; +} + static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) { spin_unlock_irqrestore(&timr->it_lock, flags); @@ -298,7 +342,6 @@ static __init int init_posix_timers(void) posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, NULL); - idr_init(&posix_timers_id); return 0; } @@ -520,9 +563,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) { if (it_id_set) { unsigned long flags; - spin_lock_irqsave(&idr_lock, flags); - idr_remove(&posix_timers_id, tmr->it_id); - spin_unlock_irqrestore(&idr_lock, flags); + spin_lock_irqsave(&hash_lock, flags); + hlist_del_rcu(&tmr->t_hash); + spin_unlock_irqrestore(&hash_lock, flags); } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); @@ -568,22 +611,11 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, return -EAGAIN; spin_lock_init(&new_timer->it_lock); - - idr_preload(GFP_KERNEL); - spin_lock_irq(&idr_lock); - error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT); - spin_unlock_irq(&idr_lock); - idr_preload_end(); - if (error < 0) { - /* - * Weird looking, but we return EAGAIN if the IDR is - * full (proper POSIX return value for this) - */ - if (error == -ENOSPC) - error = -EAGAIN; + new_timer_id = posix_timer_add(new_timer); + if (new_timer_id < 0) { + error = new_timer_id; goto out; } - new_timer_id = error; it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; @@ -661,7 +693,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) return NULL; rcu_read_lock(); - timr = idr_find(&posix_timers_id, (int)timer_id); + timr = posix_timer_by_id(timer_id); if (timr) { spin_lock_irqsave(&timr->it_lock, *flags); if (timr->it_signal == current->signal) { -- cgit v1.1 From 60cf7ea849e77c8782dee147cfb8c38d1984236e Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Tue, 26 Mar 2013 19:56:29 -0500 Subject: timer_list: Split timer_list_show_tickdevices Split timer_list_show_tickdevices() into the header printout and pull the rest up to timer_list_show. This is a preparatory patch for converting timer_list to a proper seqfile with its own iterator Signed-off-by: Nathan Zimmer Reported-by: Dave Jones Cc: John Stultz Cc: Stephen Boyd Link: http://lkml.kernel.org/r/1364345790-14577-2-git-send-email-nzimmer@sgi.com Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index af5a7e9..380a589 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -133,7 +133,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - SEQ_printf(m, "\n"); SEQ_printf(m, "cpu: %d\n", cpu); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { SEQ_printf(m, " clock %d:\n", i); @@ -187,6 +186,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #undef P #undef P_ns + SEQ_printf(m, "\n"); } #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -195,7 +195,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; - SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); if (cpu < 0) SEQ_printf(m, "Broadcast device\n"); @@ -230,12 +229,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->event_handler); SEQ_printf(m, "\n"); SEQ_printf(m, " retries: %lu\n", dev->retries); + SEQ_printf(m, "\n"); } -static void timer_list_show_tickdevices(struct seq_file *m) +static void timer_list_show_tickdevices_header(struct seq_file *m) { - int cpu; - #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", @@ -246,12 +244,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) #endif SEQ_printf(m, "\n"); #endif - for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu), cpu); - SEQ_printf(m, "\n"); } -#else -static void timer_list_show_tickdevices(struct seq_file *m) { } #endif static int timer_list_show(struct seq_file *m, void *v) @@ -262,12 +255,16 @@ static int timer_list_show(struct seq_file *m, void *v) SEQ_printf(m, "Timer List Version: v0.7\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + SEQ_printf(m, "\n"); for_each_online_cpu(cpu) print_cpu(m, cpu, now); - SEQ_printf(m, "\n"); - timer_list_show_tickdevices(m); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + timer_list_show_tickdevices_header(m); + for_each_online_cpu(cpu) + print_tickdevice(m, tick_get_device(cpu), cpu); +#endif return 0; } -- cgit v1.1 From b3956a896ea57f25cacd74708b8fab611543a81d Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Tue, 26 Mar 2013 19:56:30 -0500 Subject: timer_list: Convert timer list to be a proper seq_file When running with 4096 cores attemping to read /proc/timer_list will fail with an ENOMEM condition. On a sufficantly large systems the total amount of data is more then 4mb, so it won't fit into a single buffer. The failure can also occur on smaller systems when memory fragmentation is high as reported by Dave Jones. Convert /proc/timer_list to a proper seq_file with its own iterator. This is a little more complex given that we have to make two passes with two separate headers. sysrq_timer_list_show also needed to be updated to reflect the fact that now timer_list_show only does one cpu at at time. Signed-off-by: Nathan Zimmer Reported-by: Dave Jones Cc: John Stultz Cc: Stephen Boyd Link: http://lkml.kernel.org/r/1364345790-14577-3-git-send-email-nzimmer@sgi.com Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 89 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 77 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 380a589..3bdf283 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -20,6 +20,13 @@ #include + +struct timer_list_iter { + int cpu; + bool second_pass; + u64 now; +}; + typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); @@ -247,43 +254,101 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) } #endif -static int timer_list_show(struct seq_file *m, void *v) +static inline void timer_list_header(struct seq_file *m, u64 now) { - u64 now = ktime_to_ns(ktime_get()); - int cpu; - SEQ_printf(m, "Timer List Version: v0.7\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); SEQ_printf(m, "\n"); +} + +static int timer_list_show(struct seq_file *m, void *v) +{ + struct timer_list_iter *iter = v; + u64 now = ktime_to_ns(ktime_get()); + + if (iter->cpu == -1 && !iter->second_pass) + timer_list_header(m, now); + else if (!iter->second_pass) + print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + else if (iter->cpu == -1 && iter->second_pass) + timer_list_show_tickdevices_header(m); + else + print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif + return 0; +} + +void sysrq_timer_list_show(void) +{ + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + timer_list_header(NULL, now); for_each_online_cpu(cpu) - print_cpu(m, cpu, now); + print_cpu(NULL, cpu, now); #ifdef CONFIG_GENERIC_CLOCKEVENTS - timer_list_show_tickdevices_header(m); + timer_list_show_tickdevices_header(NULL); for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu), cpu); + print_tickdevice(NULL, tick_get_device(cpu), cpu); #endif + return; +} - return 0; +static void *timer_list_start(struct seq_file *file, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + + if (!*offset) { + iter->cpu = -1; + iter->now = ktime_to_ns(ktime_get()); + } else if (iter->cpu >= nr_cpu_ids) { +#ifdef CONFIG_GENERIC_CLOCKEVENTS + if (!iter->second_pass) { + iter->cpu = -1; + iter->second_pass = true; + } else + return NULL; +#else + return NULL; +#endif + } + return iter; } -void sysrq_timer_list_show(void) +static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) { - timer_list_show(NULL, NULL); + struct timer_list_iter *iter = file->private; + iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); + ++*offset; + return timer_list_start(file, offset); } +static void timer_list_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations timer_list_sops = { + .start = timer_list_start, + .next = timer_list_next, + .stop = timer_list_stop, + .show = timer_list_show, +}; + static int timer_list_open(struct inode *inode, struct file *filp) { - return single_open(filp, timer_list_show, NULL); + return seq_open_private(filp, &timer_list_sops, + sizeof(struct timer_list_iter)); } static const struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = seq_release_private, }; static int __init init_timer_list_procfs(void) -- cgit v1.1 From c038c1c44179c80da6201f91ef354e48d5689617 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 17 Apr 2013 10:26:06 -0700 Subject: clockevents: Switch into oneshot mode even if broadcast registered late tick_oneshot_notify() is used to notify a particular CPU to try to switch into oneshot mode after a oneshot capable tick device is registered and tick_clock_notify() is used to notify all CPUs to try to switch into oneshot mode after a high res clocksource is registered. There is one caveat; if the tick devices suffer from FEAT_C3_STOP we don't try to switch into oneshot mode unless we have a oneshot capable broadcast device already registered. If the broadcast device is registered after the tick devices that have FEAT_C3_STOP we'll never try to switch into oneshot mode again, causing us to be stuck in periodic mode forever. Avoid this scenario by calling tick_clock_notify() after we register the broadcast device so that we try to switch into oneshot mode on all CPUs one more time. [ tglx: Adopted to timers/core and added a comment ] Signed-off-by: Stephen Boyd Link: http://lkml.kernel.org/r/1366219566-29783-1-git-send-email-sboyd@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index d76d816..f8d2109 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -75,6 +75,16 @@ int tick_check_broadcast_device(struct clock_event_device *dev) tick_broadcast_device.evtdev = dev; if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(dev); + /* + * Inform all cpus about this. We might be in a situation + * where we did not switch to oneshot mode because the per cpu + * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack + * of a oneshot capable broadcast device. Without that + * notification the systems stays stuck in periodic mode + * forever. + */ + if (dev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_clock_notify(); return 1; } -- cgit v1.1 From d2054b2c11682495fca41e9d4092e654df63b517 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 18 Apr 2013 12:51:19 +0200 Subject: posix-timers: Remove unused variable Remove the unused variable *node introduced by commit 5ed67f05 (posix timers: Allocate timer id per process) Signed-off-by: Thomas Gleixner Cc: Pavel Emelyanov --- kernel/posix-timers.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 34d7592..424c2d4 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -154,7 +154,6 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, struct signal_struct *sig, timer_t id) { - struct hlist_node *node; struct k_itimer *timer; hlist_for_each_entry_rcu(timer, head, t_hash) { -- cgit v1.1 From 77c675ba18836802f6b73d2d773481d06ebc0f04 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Apr 2013 09:37:04 +0200 Subject: timekeeping: Update tk->cycle_last in resume commit 7ec98e15aa (timekeeping: Delay update of clock->cycle_last) forgot to update tk->cycle_last in the resume path. This results in a stale value versus clock->cycle_last and prevents resume in the worst case. Reported-by: Jiri Slaby Reported-and-tested-by: Borislav Petkov Acked-by: John Stultz Cc: Linux-pm mailing list Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304211648150.21884@ionos Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 675f720..98cd470 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -951,7 +951,7 @@ static void timekeeping_resume(void) __timekeeping_inject_sleeptime(tk, &ts_delta); /* Re-base the last cycle value */ - clock->cycle_last = cycle_now; + tk->cycle_last = clock->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, false, true); -- cgit v1.1 From 6f7a05d7018de222e40ca003721037a530979974 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 11:45:53 +0200 Subject: clockevents: Set dummy handler on CPU_DEAD shutdown Vitaliy reported that a per cpu HPET timer interrupt crashes the system during hibernation. What happens is that the per cpu HPET timer gets shut down when the nonboot cpus are stopped. When the nonboot cpus are onlined again the HPET code sets up the MSI interrupt which fires before the clock event device is registered. The event handler is still set to hrtimer_interrupt, which then crashes the machine due to highres mode not being active. See http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=700333 There is no real good way to avoid that in the HPET code. The HPET code alrady has a mechanism to detect spurious interrupts when event handler == NULL for a similar reason. We can handle that in the clockevent/tick layer and replace the previous functional handler with a dummy handler like we do in tick_setup_new_device(). The original clockevents code did this in clockevents_exchange_device(), but that got removed by commit 7c1e76897 (clockevents: prevent clockevent event_handler ending up handler_noop) which forgot to fix it up in tick_shutdown(). Same issue with the broadcast device. Reported-by: Vitaliy Fillipov Cc: Ben Hutchings Cc: stable@vger.kernel.org Cc: 700333@bugs.debian.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 4 ++++ kernel/time/tick-common.c | 1 + 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 6e23fde..61d00a8 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -66,6 +66,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) */ int tick_check_broadcast_device(struct clock_event_device *dev) { + struct clock_event_device *cur = tick_broadcast_device.evtdev; + if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || @@ -73,6 +75,8 @@ int tick_check_broadcast_device(struct clock_event_device *dev) return 0; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); + if (cur) + cur->event_handler = clockevents_handle_noop; tick_broadcast_device.evtdev = dev; if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(dev); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 74413e3..6176a3e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -323,6 +323,7 @@ static void tick_shutdown(unsigned int *cpup) */ dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); + dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } raw_spin_unlock_irqrestore(&tick_device_lock, flags); -- cgit v1.1