From 22001821d9cb6ddb83ee4e1f81e6b905de623165 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Jun 2014 23:59:12 +0000 Subject: acct: Use ktime_get_ts() do_posix_clock_monotonic_gettime() is a leftover from the initial posix timer implementation which maps to ktime_get_ts() Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140611234606.764810535@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/acct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 808a86f..1be013c 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -484,7 +484,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); /* calculate run_time in nsec*/ - do_posix_clock_monotonic_gettime(&uptime); + ktime_get_ts(&uptime); run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC + current->group_leader->start_time.tv_nsec; -- cgit v1.1 From b5d7682533941edb121f7495bdb2a17abac03ff3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Jun 2014 23:59:13 +0000 Subject: delayacct: Use ktime_get_ts() do_posix_clock_monotonic_gettime() is a leftover from the initial posix timer implementation which maps to ktime_get_ts(). Remove the silly wrapper while at it. 
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140611234606.931409215@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/delayacct.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 54996b7..de699f4 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -46,16 +46,6 @@ void __delayacct_tsk_init(struct task_struct *tsk) } /* - * Start accounting for a delay statistic using - * its starting timestamp (@start) - */ - -static inline void delayacct_start(struct timespec *start) -{ - do_posix_clock_monotonic_gettime(start); -} - -/* * Finish delay accounting for a statistic using * its timestamps (@start, @end), accumalator (@total) and @count */ @@ -67,7 +57,7 @@ static void delayacct_end(struct timespec *start, struct timespec *end, s64 ns; unsigned long flags; - do_posix_clock_monotonic_gettime(end); + ktime_get_ts(end); ts = timespec_sub(*end, *start); ns = timespec_to_ns(&ts); if (ns < 0) @@ -81,7 +71,7 @@ static void delayacct_end(struct timespec *start, struct timespec *end, void __delayacct_blkio_start(void) { - delayacct_start(&current->delays->blkio_start); + ktime_get_ts(&current->delays->blkio_start); } void __delayacct_blkio_end(void) @@ -169,7 +159,7 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) void __delayacct_freepages_start(void) { - delayacct_start(&current->delays->freepages_start); + ktime_get_ts(&current->delays->freepages_start); } void __delayacct_freepages_end(void) -- cgit v1.1 From 4e8c5847d1c55efed896508fb769f78ab07b968a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Jun 2014 23:59:13 +0000 Subject: tsacct: Use ktime_get_ts() do_posix_clock_monotonic_gettime() is a leftover from the initial posix timer implementation which maps to ktime_get_ts() Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140611234606.840900621@linutronix.de 
Signed-off-by: Thomas Gleixner --- kernel/tsacct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index a1dd9a1..ea6d170 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -38,7 +38,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); /* calculate task elapsed time in timespec */ - do_posix_clock_monotonic_gettime(&uptime); + ktime_get_ts(&uptime); ts = timespec_sub(uptime, tsk->start_time); /* rebase elapsed time to usec (should never be negative) */ ac_etime = timespec_to_ns(&ts); -- cgit v1.1 From a9821c741c960a77a7f08491883f9cc4bffd2279 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Jun 2014 23:59:16 +0000 Subject: kdb: Use ktime_get_ts() do_posix_clock_monotonic_gettime() is a leftover from the initial posix timer implementation which maps to ktime_get_ts(). Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Cc: Jason Wessel Link: http://lkml.kernel.org/r/20140611234607.261629142@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2f7c760..379650b 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2472,7 +2472,7 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) static void kdb_sysinfo(struct sysinfo *val) { struct timespec uptime; - do_posix_clock_monotonic_gettime(&uptime); + ktime_get_ts(&uptime); memset(val, 0, sizeof(*val)); val->uptime = uptime.tv_sec; val->loads[0] = avenrun[0]; -- cgit v1.1 From f037c1171db79be2a047b1a5aafa2fd1f05051cb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Jun 2014 23:59:17 +0000 Subject: fork: Use ktime_get_ts() do_posix_clock_monotonic_gettime() is a leftover from the initial posix timer implementation which maps to ktime_get_ts(). 
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/20140611234607.427408044@linutronix.de Signed-off-by: Thomas Gleixner Cc: Oleg Nesterov --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d2799d1..ea0dd70 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1262,7 +1262,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, posix_cpu_timers_init(p); - do_posix_clock_monotonic_gettime(&p->start_time); + ktime_get_ts(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; -- cgit v1.1 From 71d5d2b7229dc6a7ec0168076096c847b2bb2f48 Mon Sep 17 00:00:00 2001 From: Pramod Gurav Date: Fri, 13 Jun 2014 11:49:42 +0530 Subject: alarmtimer: Export symbols of alarmtimer_get_rtcdev Export symbol of alarmtimer_get_rtcdev so that it is used by any driver when built as module like, drivers/staging/android/alarm-dev.c. CC: John Stultz CC: Marcus Gelderie Signed-off-by: Pramod Gurav Signed-off-by: Greg Kroah-Hartman --- kernel/time/alarmtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 88c9c65..a53ba0b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void) return ret; } - +EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); static int alarmtimer_rtc_add_device(struct device *dev, struct class_interface *class_intf) -- cgit v1.1 From 43a775916d63d1c822107b39987192ca5ced445c Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 9 Jun 2014 16:20:05 +0800 Subject: genirq: Export irq_domain_disassociate() to architecture interrupt drivers Export irq_domain_disassociate() to architecture interrupt drivers, so it could be used to handle legacy IRQ descriptors on x86. 
Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Joerg Roedel Cc: Paul Gortmaker Cc: Greg Kroah-Hartman Cc: Benjamin Herrenschmidt Cc: Grant Likely Cc: Rafael J. Wysocki Cc: Bjorn Helgaas Cc: Randy Dunlap Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1402302011-23642-37-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index eb5e10e..6534ff6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -231,7 +231,7 @@ void irq_set_default_host(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_set_default_host); -static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) +void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { struct irq_data *irq_data = irq_get_irq_data(irq); irq_hw_number_t hwirq; -- cgit v1.1 From 5cee964597260237dd2cabb3ec22bba0da24b25d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 22 Jun 2014 12:06:40 +0200 Subject: time/timers: Move all time(r) related files into kernel/time Except for Kconfig.HZ. That needs a separate treatment. 
Signed-off-by: Thomas Gleixner --- kernel/Makefile | 25 +- kernel/hrtimer.c | 1915 ---------------------------------------- kernel/itimer.c | 301 ------- kernel/posix-cpu-timers.c | 1490 ------------------------------- kernel/posix-timers.c | 1121 ----------------------- kernel/time.c | 714 --------------- kernel/time/Makefile | 17 + kernel/time/hrtimer.c | 1915 ++++++++++++++++++++++++++++++++++++++++ kernel/time/itimer.c | 301 +++++++ kernel/time/posix-cpu-timers.c | 1490 +++++++++++++++++++++++++++++++ kernel/time/posix-timers.c | 1121 +++++++++++++++++++++++ kernel/time/time.c | 714 +++++++++++++++ kernel/time/timeconst.bc | 108 +++ kernel/time/timer.c | 1734 ++++++++++++++++++++++++++++++++++++ kernel/timeconst.bc | 108 --- kernel/timer.c | 1734 ------------------------------------ 16 files changed, 7404 insertions(+), 7404 deletions(-) delete mode 100644 kernel/hrtimer.c delete mode 100644 kernel/itimer.c delete mode 100644 kernel/posix-cpu-timers.c delete mode 100644 kernel/posix-timers.c delete mode 100644 kernel/time.c create mode 100644 kernel/time/hrtimer.c create mode 100644 kernel/time/itimer.c create mode 100644 kernel/time/posix-cpu-timers.c create mode 100644 kernel/time/posix-timers.c create mode 100644 kernel/time/time.c create mode 100644 kernel/time/timeconst.bc create mode 100644 kernel/time/timer.c delete mode 100644 kernel/timeconst.bc delete mode 100644 kernel/timer.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f2a8b62..973a40c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -3,12 +3,11 @@ # obj-y = fork.o exec_domain.o panic.o \ - cpu.o exit.o itimer.o time.o softirq.o resource.o \ - sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ + cpu.o exit.o softirq.o resource.o \ + sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ - extable.o params.o posix-timers.o \ - kthread.o sys_ni.o posix-cpu-timers.o \ - hrtimer.o nsproxy.o \ + 
extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o smpboot.o @@ -110,22 +109,6 @@ targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) -$(obj)/time.o: $(obj)/timeconst.h - -quiet_cmd_hzfile = HZFILE $@ - cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ - -targets += hz.bc -$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE - $(call if_changed,hzfile) - -quiet_cmd_bc = BC $@ - cmd_bc = bc -q $(filter-out FORCE,$^) > $@ - -targets += timeconst.h -$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE - $(call if_changed,bc) - ############################################################################### # # Roll all the X.509 certificates that we can find together and pull them into diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c deleted file mode 100644 index 3ab2899..0000000 --- a/kernel/hrtimer.c +++ /dev/null @@ -1,1915 +0,0 @@ -/* - * linux/kernel/hrtimer.c - * - * Copyright(C) 2005-2006, Thomas Gleixner - * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner - * - * High-resolution kernel timers - * - * In contrast to the low-resolution timeout API implemented in - * kernel/timer.c, hrtimers provide finer resolution and accuracy - * depending on system configuration and capabilities. - * - * These timers are currently used for: - * - itimers - * - POSIX timers - * - nanosleep - * - precise in-kernel timing - * - * Started by: Thomas Gleixner and Ingo Molnar - * - * Credits: - * based on kernel/timer.c - * - * Help, testing, suggestions, bugfixes, improvements were - * provided by: - * - * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel - * et. al. 
- * - * For licencing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -/* - * The timer bases: - * - * There are more clockids then hrtimer bases. Thus, we index - * into the timer bases by the hrtimer_base_type enum. When trying - * to reach a base using a clockid, hrtimer_clockid_to_base() - * is used to convert from clockid to the proper hrtimer_base_type. - */ -DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = -{ - - .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), - .clock_base = - { - { - .index = HRTIMER_BASE_MONOTONIC, - .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, - .resolution = KTIME_LOW_RES, - }, - { - .index = HRTIMER_BASE_REALTIME, - .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, - .resolution = KTIME_LOW_RES, - }, - { - .index = HRTIMER_BASE_BOOTTIME, - .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, - .resolution = KTIME_LOW_RES, - }, - { - .index = HRTIMER_BASE_TAI, - .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, - .resolution = KTIME_LOW_RES, - }, - } -}; - -static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, - [CLOCK_TAI] = HRTIMER_BASE_TAI, -}; - -static inline int hrtimer_clockid_to_base(clockid_t clock_id) -{ - return hrtimer_clock_to_base_table[clock_id]; -} - - -/* - * Get the coarse grained time at the softirq based on xtime and - * wall_to_monotonic. 
- */ -static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) -{ - ktime_t xtim, mono, boot; - struct timespec xts, tom, slp; - s32 tai_offset; - - get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); - tai_offset = timekeeping_get_tai_offset(); - - xtim = timespec_to_ktime(xts); - mono = ktime_add(xtim, timespec_to_ktime(tom)); - boot = ktime_add(mono, timespec_to_ktime(slp)); - base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; - base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; - base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; - base->clock_base[HRTIMER_BASE_TAI].softirq_time = - ktime_add(xtim, ktime_set(tai_offset, 0)); -} - -/* - * Functions and macros which are different for UP/SMP systems are kept in a - * single place - */ -#ifdef CONFIG_SMP - -/* - * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock - * means that all timers which are tied to this base via timer->base are - * locked, and the base itself is locked too. - * - * So __run_timers/migrate_timers can safely modify all timers which could - * be found on the lists/queues. - * - * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. - */ -static -struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, - unsigned long *flags) -{ - struct hrtimer_clock_base *base; - - for (;;) { - base = timer->base; - if (likely(base != NULL)) { - raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); - if (likely(base == timer->base)) - return base; - /* The timer has migrated to another CPU: */ - raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); - } - cpu_relax(); - } -} - -/* - * With HIGHRES=y we do not migrate the timer when it is expiring - * before the next event on the target cpu because we cannot reprogram - * the target cpu hardware and we would cause it to fire late. 
- * - * Called with cpu_base->lock of target cpu held. - */ -static int -hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) -{ -#ifdef CONFIG_HIGH_RES_TIMERS - ktime_t expires; - - if (!new_base->cpu_base->hres_active) - return 0; - - expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires.tv64 <= new_base->cpu_base->expires_next.tv64; -#else - return 0; -#endif -} - -/* - * Switch the timer base to the current CPU when possible. - */ -static inline struct hrtimer_clock_base * -switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, - int pinned) -{ - struct hrtimer_clock_base *new_base; - struct hrtimer_cpu_base *new_cpu_base; - int this_cpu = smp_processor_id(); - int cpu = get_nohz_timer_target(pinned); - int basenum = base->index; - -again: - new_cpu_base = &per_cpu(hrtimer_bases, cpu); - new_base = &new_cpu_base->clock_base[basenum]; - - if (base != new_base) { - /* - * We are trying to move timer to new_base. - * However we can't change timer's base while it is running, - * so we keep it on the same CPU. No hassle vs. reprogramming - * the event source in the high resolution case. The softirq - * code will take care of this when the timer function has - * completed. There is no conflict as we hold the lock until - * the timer is enqueued. 
- */ - if (unlikely(hrtimer_callback_running(timer))) - return base; - - /* See the comment in lock_timer_base() */ - timer->base = NULL; - raw_spin_unlock(&base->cpu_base->lock); - raw_spin_lock(&new_base->cpu_base->lock); - - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; - raw_spin_unlock(&new_base->cpu_base->lock); - raw_spin_lock(&base->cpu_base->lock); - timer->base = base; - goto again; - } - timer->base = new_base; - } else { - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; - goto again; - } - } - return new_base; -} - -#else /* CONFIG_SMP */ - -static inline struct hrtimer_clock_base * -lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) -{ - struct hrtimer_clock_base *base = timer->base; - - raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); - - return base; -} - -# define switch_hrtimer_base(t, b, p) (b) - -#endif /* !CONFIG_SMP */ - -/* - * Functions for the union type storage format of ktime_t which are - * too large for inlining: - */ -#if BITS_PER_LONG < 64 -# ifndef CONFIG_KTIME_SCALAR -/** - * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable - * @kt: addend - * @nsec: the scalar nsec value to add - * - * Returns the sum of kt and nsec in ktime_t format - */ -ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) -{ - ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - /* Make sure nsec fits into long */ - if (unlikely(nsec > KTIME_SEC_MAX)) - return (ktime_t){ .tv64 = KTIME_MAX }; - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_add(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_add_ns); - -/** - * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable - * @kt: minuend - * @nsec: the scalar nsec value to subtract - * - * Returns the subtraction of @nsec from @kt in ktime_t format - */ -ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) -{ - 
ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_sub(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_sub_ns); -# endif /* !CONFIG_KTIME_SCALAR */ - -/* - * Divide a ktime value by a nanosecond value - */ -u64 ktime_divns(const ktime_t kt, s64 div) -{ - u64 dclc; - int sft = 0; - - dclc = ktime_to_ns(kt); - /* Make sure the divisor is less than 2^32: */ - while (div >> 32) { - sft++; - div >>= 1; - } - dclc >>= sft; - do_div(dclc, (unsigned long) div); - - return dclc; -} -#endif /* BITS_PER_LONG >= 64 */ - -/* - * Add two ktime values and do a safety check for overflow: - */ -ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) -{ - ktime_t res = ktime_add(lhs, rhs); - - /* - * We use KTIME_SEC_MAX here, the maximum timeout which we can - * return to user space in a timespec: - */ - if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64) - res = ktime_set(KTIME_SEC_MAX, 0); - - return res; -} - -EXPORT_SYMBOL_GPL(ktime_add_safe); - -#ifdef CONFIG_DEBUG_OBJECTS_TIMERS - -static struct debug_obj_descr hrtimer_debug_descr; - -static void *hrtimer_debug_hint(void *addr) -{ - return ((struct hrtimer *) addr)->function; -} - -/* - * fixup_init is called when: - * - an active object is initialized - */ -static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) -{ - struct hrtimer *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - hrtimer_cancel(timer); - debug_object_init(timer, &hrtimer_debug_descr); - return 1; - default: - return 0; - } -} - -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - */ -static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) -{ - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - WARN_ON_ONCE(1); - return 0; - - case ODEBUG_STATE_ACTIVE: - 
WARN_ON(1); - - default: - return 0; - } -} - -/* - * fixup_free is called when: - * - an active object is freed - */ -static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) -{ - struct hrtimer *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - hrtimer_cancel(timer); - debug_object_free(timer, &hrtimer_debug_descr); - return 1; - default: - return 0; - } -} - -static struct debug_obj_descr hrtimer_debug_descr = { - .name = "hrtimer", - .debug_hint = hrtimer_debug_hint, - .fixup_init = hrtimer_fixup_init, - .fixup_activate = hrtimer_fixup_activate, - .fixup_free = hrtimer_fixup_free, -}; - -static inline void debug_hrtimer_init(struct hrtimer *timer) -{ - debug_object_init(timer, &hrtimer_debug_descr); -} - -static inline void debug_hrtimer_activate(struct hrtimer *timer) -{ - debug_object_activate(timer, &hrtimer_debug_descr); -} - -static inline void debug_hrtimer_deactivate(struct hrtimer *timer) -{ - debug_object_deactivate(timer, &hrtimer_debug_descr); -} - -static inline void debug_hrtimer_free(struct hrtimer *timer) -{ - debug_object_free(timer, &hrtimer_debug_descr); -} - -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode); - -void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_object_init_on_stack(timer, &hrtimer_debug_descr); - __hrtimer_init(timer, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); - -void destroy_hrtimer_on_stack(struct hrtimer *timer) -{ - debug_object_free(timer, &hrtimer_debug_descr); -} - -#else -static inline void debug_hrtimer_init(struct hrtimer *timer) { } -static inline void debug_hrtimer_activate(struct hrtimer *timer) { } -static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } -#endif - -static inline void -debug_init(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) -{ - debug_hrtimer_init(timer); - trace_hrtimer_init(timer, clockid, mode); -} - -static 
inline void debug_activate(struct hrtimer *timer) -{ - debug_hrtimer_activate(timer); - trace_hrtimer_start(timer); -} - -static inline void debug_deactivate(struct hrtimer *timer) -{ - debug_hrtimer_deactivate(timer); - trace_hrtimer_cancel(timer); -} - -/* High resolution timer related functions */ -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer enabled ? - */ -static int hrtimer_hres_enabled __read_mostly = 1; - -/* - * Enable / Disable high resolution mode - */ -static int __init setup_hrtimer_hres(char *str) -{ - if (!strcmp(str, "off")) - hrtimer_hres_enabled = 0; - else if (!strcmp(str, "on")) - hrtimer_hres_enabled = 1; - else - return 0; - return 1; -} - -__setup("highres=", setup_hrtimer_hres); - -/* - * hrtimer_high_res_enabled - query, if the highres mode is enabled - */ -static inline int hrtimer_is_hres_enabled(void) -{ - return hrtimer_hres_enabled; -} - -/* - * Is the high resolution mode active ? - */ -static inline int hrtimer_hres_active(void) -{ - return __this_cpu_read(hrtimer_bases.hres_active); -} - -/* - * Reprogram the event source with checking both queues for the - * next event - * Called with interrupts disabled and base->lock held - */ -static void -hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) -{ - int i; - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires, expires_next; - - expires_next.tv64 = KTIME_MAX; - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - timer = container_of(next, struct hrtimer, node); - - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - /* - * clock_was_set() has changed base->offset so the - * result might be negative. 
Fix it up to prevent a - * false positive in clockevents_program_event() - */ - if (expires.tv64 < 0) - expires.tv64 = 0; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; - } - - if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) - return; - - cpu_base->expires_next.tv64 = expires_next.tv64; - - /* - * If a hang was detected in the last timer interrupt then we - * leave the hang delay active in the hardware. We want the - * system to make progress. That also prevents the following - * scenario: - * T1 expires 50ms from now - * T2 expires 5s from now - * - * T1 is removed, so this code is called and would reprogram - * the hardware to 5s from now. Any hrtimer_start after that - * will not reprogram the hardware due to hang_detected being - * set. So we'd effectivly block all timers until the T2 event - * fires. - */ - if (cpu_base->hang_detected) - return; - - if (cpu_base->expires_next.tv64 != KTIME_MAX) - tick_program_event(cpu_base->expires_next, 1); -} - -/* - * Shared reprogramming for clock_realtime and clock_monotonic - * - * When a timer is enqueued and expires earlier than the already enqueued - * timers, we have to check, whether it expires earlier than the timer for - * which the clock event device was armed. - * - * Called with interrupts disabled and base->cpu_base.lock held - */ -static int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - int res; - - WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); - - /* - * When the callback is running, we do not reprogram the clock event - * device. The timer callback is either running on a different CPU or - * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled either by the softirq, which called the - * callback or at the end of the hrtimer_interrupt. 
- */ - if (hrtimer_callback_running(timer)) - return 0; - - /* - * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Nothing wrong - * about that, just avoid to call into the tick code, which - * has now objections against negative expiry values. - */ - if (expires.tv64 < 0) - return -ETIME; - - if (expires.tv64 >= cpu_base->expires_next.tv64) - return 0; - - /* - * If a hang was detected in the last timer interrupt then we - * do not schedule a timer which is earlier than the expiry - * which we enforced in the hang detection. We want the system - * to make progress. - */ - if (cpu_base->hang_detected) - return 0; - - /* - * Clockevents returns -ETIME, when the event was in the past. - */ - res = tick_program_event(expires, 0); - if (!IS_ERR_VALUE(res)) - cpu_base->expires_next = expires; - return res; -} - -/* - * Initialize the high resolution related parts of cpu_base - */ -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) -{ - base->expires_next.tv64 = KTIME_MAX; - base->hres_active = 0; -} - -/* - * When High resolution timers are active, try to reprogram. Note, that in case - * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry - * check happens. The timer gets enqueued into the rbtree. The reprogramming - * and expiry check is done in the hrtimer_interrupt or in the softirq. 
- */ -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); -} - -static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) -{ - ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; - ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; - ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - - return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); -} - -/* - * Retrigger next event is called after clock was set - * - * Called with interrupts disabled via on_each_cpu() - */ -static void retrigger_next_event(void *arg) -{ - struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); - - if (!hrtimer_hres_active()) - return; - - raw_spin_lock(&base->lock); - hrtimer_update_base(base); - hrtimer_force_reprogram(base, 0); - raw_spin_unlock(&base->lock); -} - -/* - * Switch to high resolution mode - */ -static int hrtimer_switch_to_hres(void) -{ - int i, cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); - unsigned long flags; - - if (base->hres_active) - return 1; - - local_irq_save(flags); - - if (tick_init_highres()) { - local_irq_restore(flags); - printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); - return 0; - } - base->hres_active = 1; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) - base->clock_base[i].resolution = KTIME_HIGH_RES; - - tick_setup_sched_timer(); - /* "Retrigger" the interrupt to get things going */ - retrigger_next_event(NULL); - local_irq_restore(flags); - return 1; -} - -static void clock_was_set_work(struct work_struct *work) -{ - clock_was_set(); -} - -static DECLARE_WORK(hrtimer_work, clock_was_set_work); - -/* - * Called from timekeeping and resume code to reprogramm the hrtimer - * interrupt device on all cpus. 
- */ -void clock_was_set_delayed(void) -{ - schedule_work(&hrtimer_work); -} - -#else - -static inline int hrtimer_hres_active(void) { return 0; } -static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline int hrtimer_switch_to_hres(void) { return 0; } -static inline void -hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return 0; -} -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } -static inline void retrigger_next_event(void *arg) { } - -#endif /* CONFIG_HIGH_RES_TIMERS */ - -/* - * Clock realtime was set - * - * Change the offset of the realtime clock vs. the monotonic - * clock. - * - * We might have to reprogram the high resolution timer interrupt. On - * SMP we call the architecture specific code to retrigger _all_ high - * resolution timer interrupts. On UP we just disable interrupts and - * call the high resolution interrupt code. - */ -void clock_was_set(void) -{ -#ifdef CONFIG_HIGH_RES_TIMERS - /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 1); -#endif - timerfd_clock_was_set(); -} - -/* - * During resume we might have to reprogram the high resolution timer - * interrupt on all online CPUs. However, all other CPUs will be - * stopped with IRQs interrupts disabled so the clock_was_set() call - * must be deferred. 
- */ -void hrtimers_resume(void) -{ - WARN_ONCE(!irqs_disabled(), - KERN_INFO "hrtimers_resume() called with IRQs enabled!"); - - /* Retrigger on the local CPU */ - retrigger_next_event(NULL); - /* And schedule a retrigger for all others */ - clock_was_set_delayed(); -} - -static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - if (timer->start_site) - return; - timer->start_site = __builtin_return_address(0); - memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); - timer->start_pid = current->pid; -#endif -} - -static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; -#endif -} - -static inline void timer_stats_account_hrtimer(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - if (likely(!timer_stats_active)) - return; - timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - timer->function, timer->start_comm, 0); -#endif -} - -/* - * Counterpart to lock_hrtimer_base above: - */ -static inline -void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) -{ - raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); -} - -/** - * hrtimer_forward - forward the timer expiry - * @timer: hrtimer to forward - * @now: forward past this time - * @interval: the interval to forward - * - * Forward the timer expiry so it will expire in the future. - * Returns the number of overruns. 
- */ -u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) -{ - u64 orun = 1; - ktime_t delta; - - delta = ktime_sub(now, hrtimer_get_expires(timer)); - - if (delta.tv64 < 0) - return 0; - - if (interval.tv64 < timer->base->resolution.tv64) - interval.tv64 = timer->base->resolution.tv64; - - if (unlikely(delta.tv64 >= interval.tv64)) { - s64 incr = ktime_to_ns(interval); - - orun = ktime_divns(delta, incr); - hrtimer_add_expires_ns(timer, incr * orun); - if (hrtimer_get_expires_tv64(timer) > now.tv64) - return orun; - /* - * This (and the ktime_add() below) is the - * correction for exact: - */ - orun++; - } - hrtimer_add_expires(timer, interval); - - return orun; -} -EXPORT_SYMBOL_GPL(hrtimer_forward); - -/* - * enqueue_hrtimer - internal function to (re)start a timer - * - * The timer is inserted in expiry order. Insertion into the - * red black tree is O(log(n)). Must hold the base lock. - * - * Returns 1 when the new timer is the leftmost timer in the tree. - */ -static int enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - debug_activate(timer); - - timerqueue_add(&base->active, &timer->node); - base->cpu_base->active_bases |= 1 << base->index; - - /* - * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the - * state of a possibly running callback. - */ - timer->state |= HRTIMER_STATE_ENQUEUED; - - return (&timer->node == base->active.next); -} - -/* - * __remove_hrtimer - internal function to remove a timer - * - * Caller must hold the base lock. - * - * High resolution timer mode reprograms the clock event device when the - * timer is the one which expires next. The caller can disable this by setting - * reprogram to zero. This is useful, when the context does a reprogramming - * anyway (e.g. 
timer interrupt) - */ -static void __remove_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, - unsigned long newstate, int reprogram) -{ - struct timerqueue_node *next_timer; - if (!(timer->state & HRTIMER_STATE_ENQUEUED)) - goto out; - - next_timer = timerqueue_getnext(&base->active); - timerqueue_del(&base->active, &timer->node); - if (&timer->node == next_timer) { -#ifdef CONFIG_HIGH_RES_TIMERS - /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (base->cpu_base->expires_next.tv64 == expires.tv64) - hrtimer_force_reprogram(base->cpu_base, 1); - } -#endif - } - if (!timerqueue_getnext(&base->active)) - base->cpu_base->active_bases &= ~(1 << base->index); -out: - timer->state = newstate; -} - -/* - * remove hrtimer, called with base lock held - */ -static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) -{ - if (hrtimer_is_queued(timer)) { - unsigned long state; - int reprogram; - - /* - * Remove the timer and force reprogramming when high - * resolution mode is active and the timer is on the current - * CPU. If we remove a timer on another CPU, reprogramming is - * skipped. The interrupt event on this CPU is fired and - * reprogramming happens in the interrupt handler. This is a - * rare case and less expensive than a smp call. - */ - debug_deactivate(timer); - timer_stats_hrtimer_clear_start_info(timer); - reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); - /* - * We must preserve the CALLBACK state flag here, - * otherwise we could move the timer base in - * switch_hrtimer_base. 
- */ - state = timer->state & HRTIMER_STATE_CALLBACK; - __remove_hrtimer(timer, base, state, reprogram); - return 1; - } - return 0; -} - -int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode, - int wakeup) -{ - struct hrtimer_clock_base *base, *new_base; - unsigned long flags; - int ret, leftmost; - - base = lock_hrtimer_base(timer, &flags); - - /* Remove an active timer from the queue: */ - ret = remove_hrtimer(timer, base); - - if (mode & HRTIMER_MODE_REL) { - tim = ktime_add_safe(tim, base->get_time()); - /* - * CONFIG_TIME_LOW_RES is a temporary way for architectures - * to signal that they simply return xtime in - * do_gettimeoffset(). In this case we want to round up by - * resolution when starting a relative timer, to avoid short - * timeouts. This will go away with the GTOD framework. - */ -#ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, base->resolution); -#endif - } - - hrtimer_set_expires_range_ns(timer, tim, delta_ns); - - /* Switch the timer base, if necessary: */ - new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - - timer_stats_hrtimer_set_start_info(timer); - - leftmost = enqueue_hrtimer(timer, new_base); - - /* - * Only allow reprogramming if the new base is on this CPU. - * (it might still be on another CPU if the timer was pending) - * - * XXX send_remote_softirq() ? - */ - if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) - && hrtimer_enqueue_reprogram(timer, new_base)) { - if (wakeup) { - /* - * We need to drop cpu_base->lock to avoid a - * lock ordering issue vs. rq->lock. 
- */ - raw_spin_unlock(&new_base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - local_irq_restore(flags); - return ret; - } else { - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); - } - } - - unlock_hrtimer_base(timer, &flags); - - return ret; -} -EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); - -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) -{ - return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); -} -EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); - -/** - * hrtimer_start - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int -hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) -{ - return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); -} -EXPORT_SYMBOL_GPL(hrtimer_start); - - -/** - * hrtimer_try_to_cancel - try to deactivate a timer - * @timer: hrtimer to stop - * - * Returns: - * 0 when the timer was not active - * 1 when the timer was active - * -1 when the timer is currently excuting the callback function and - * cannot be stopped - */ -int hrtimer_try_to_cancel(struct hrtimer *timer) -{ - struct hrtimer_clock_base *base; - unsigned long flags; - int ret = -1; - - base = lock_hrtimer_base(timer, &flags); - - if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base); - - unlock_hrtimer_base(timer, &flags); - - return ret; - -} -EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); - -/** - * hrtimer_cancel - cancel 
a timer and wait for the handler to finish. - * @timer: the timer to be cancelled - * - * Returns: - * 0 when the timer was not active - * 1 when the timer was active - */ -int hrtimer_cancel(struct hrtimer *timer) -{ - for (;;) { - int ret = hrtimer_try_to_cancel(timer); - - if (ret >= 0) - return ret; - cpu_relax(); - } -} -EXPORT_SYMBOL_GPL(hrtimer_cancel); - -/** - * hrtimer_get_remaining - get remaining time for the timer - * @timer: the timer to read - */ -ktime_t hrtimer_get_remaining(const struct hrtimer *timer) -{ - unsigned long flags; - ktime_t rem; - - lock_hrtimer_base(timer, &flags); - rem = hrtimer_expires_remaining(timer); - unlock_hrtimer_base(timer, &flags); - - return rem; -} -EXPORT_SYMBOL_GPL(hrtimer_get_remaining); - -#ifdef CONFIG_NO_HZ_COMMON -/** - * hrtimer_get_next_event - get the time until next expiry event - * - * Returns the delta to the next expiry event or KTIME_MAX if no timer - * is pending. - */ -ktime_t hrtimer_get_next_event(void) -{ - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; - unsigned long flags; - int i; - - raw_spin_lock_irqsave(&cpu_base->lock, flags); - - if (!hrtimer_hres_active()) { - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - - timer = container_of(next, struct hrtimer, node); - delta.tv64 = hrtimer_get_expires_tv64(timer); - delta = ktime_sub(delta, base->get_time()); - if (delta.tv64 < mindelta.tv64) - mindelta.tv64 = delta.tv64; - } - } - - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - - if (mindelta.tv64 < 0) - mindelta.tv64 = 0; - return mindelta; -} -#endif - -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - struct hrtimer_cpu_base *cpu_base; - int base; - - memset(timer, 0, 
sizeof(struct hrtimer)); - - cpu_base = &__raw_get_cpu_var(hrtimer_bases); - - if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) - clock_id = CLOCK_MONOTONIC; - - base = hrtimer_clockid_to_base(clock_id); - timer->base = &cpu_base->clock_base[base]; - timerqueue_init(&timer->node); - -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; - timer->start_pid = -1; - memset(timer->start_comm, 0, TASK_COMM_LEN); -#endif -} - -/** - * hrtimer_init - initialize a timer to the given clock - * @timer: the timer to be initialized - * @clock_id: the clock to be used - * @mode: timer mode abs/rel - */ -void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_init(timer, clock_id, mode); - __hrtimer_init(timer, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init); - -/** - * hrtimer_get_res - get the timer resolution for a clock - * @which_clock: which clock to query - * @tp: pointer to timespec variable to store the resolution - * - * Store the resolution of the clock selected by @which_clock in the - * variable pointed to by @tp. 
- */ -int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) -{ - struct hrtimer_cpu_base *cpu_base; - int base = hrtimer_clockid_to_base(which_clock); - - cpu_base = &__raw_get_cpu_var(hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); - - return 0; -} -EXPORT_SYMBOL_GPL(hrtimer_get_res); - -static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) -{ - struct hrtimer_clock_base *base = timer->base; - struct hrtimer_cpu_base *cpu_base = base->cpu_base; - enum hrtimer_restart (*fn)(struct hrtimer *); - int restart; - - WARN_ON(!irqs_disabled()); - - debug_deactivate(timer); - __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); - timer_stats_account_hrtimer(timer); - fn = timer->function; - - /* - * Because we run timers from hardirq context, there is no chance - * they get migrated to another cpu, therefore its safe to unlock - * the timer base. - */ - raw_spin_unlock(&cpu_base->lock); - trace_hrtimer_expire_entry(timer, now); - restart = fn(timer); - trace_hrtimer_expire_exit(timer); - raw_spin_lock(&cpu_base->lock); - - /* - * Note: We clear the CALLBACK bit after enqueue_hrtimer and - * we do not reprogramm the event hardware. 
Happens either in - * hrtimer_start_range_ns() or in hrtimer_interrupt() - */ - if (restart != HRTIMER_NORESTART) { - BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); - enqueue_hrtimer(timer, base); - } - - WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); - - timer->state &= ~HRTIMER_STATE_CALLBACK; -} - -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer interrupt - * Called with interrupts disabled - */ -void hrtimer_interrupt(struct clock_event_device *dev) -{ - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - ktime_t expires_next, now, entry_time, delta; - int i, retries = 0; - - BUG_ON(!cpu_base->hres_active); - cpu_base->nr_events++; - dev->next_event.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); - entry_time = now = hrtimer_update_base(cpu_base); -retry: - expires_next.tv64 = KTIME_MAX; - /* - * We set expires_next to KTIME_MAX here with cpu_base->lock - * held to prevent that a timer is enqueued in our queue via - * the migration code. This does not affect enqueueing of - * timers which run their callback and need to be requeued on - * this CPU. - */ - cpu_base->expires_next.tv64 = KTIME_MAX; - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - struct hrtimer_clock_base *base; - struct timerqueue_node *node; - ktime_t basenow; - - if (!(cpu_base->active_bases & (1 << i))) - continue; - - base = cpu_base->clock_base + i; - basenow = ktime_add(now, base->offset); - - while ((node = timerqueue_getnext(&base->active))) { - struct hrtimer *timer; - - timer = container_of(node, struct hrtimer, node); - - /* - * The immediate goal for using the softexpires is - * minimizing wakeups, not running timers at the - * earliest interrupt after their soft expiration. - * This allows us to avoid using a Priority Search - * Tree, which can answer a stabbing querry for - * overlapping intervals and instead use the simple - * BST we already have. 
- * We don't add extra wakeups by delaying timers that - * are right-of a not yet expired timer, because that - * timer will have to trigger a wakeup anyway. - */ - - if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (expires.tv64 < 0) - expires.tv64 = KTIME_MAX; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; - break; - } - - __run_hrtimer(timer, &basenow); - } - } - - /* - * Store the new expiry value so the migration code can verify - * against it. - */ - cpu_base->expires_next = expires_next; - raw_spin_unlock(&cpu_base->lock); - - /* Reprogramming necessary ? */ - if (expires_next.tv64 == KTIME_MAX || - !tick_program_event(expires_next, 0)) { - cpu_base->hang_detected = 0; - return; - } - - /* - * The next timer was already expired due to: - * - tracing - * - long lasting callbacks - * - being scheduled away when running in a VM - * - * We need to prevent that we loop forever in the hrtimer - * interrupt routine. We give it 3 attempts to avoid - * overreacting on some spurious event. - * - * Acquire base lock for updating the offsets and retrieving - * the current time. - */ - raw_spin_lock(&cpu_base->lock); - now = hrtimer_update_base(cpu_base); - cpu_base->nr_retries++; - if (++retries < 3) - goto retry; - /* - * Give the system a chance to do something else than looping - * here. We stored the entry time, so we know exactly how long - * we spent here. We schedule the next event this amount of - * time away. - */ - cpu_base->nr_hangs++; - cpu_base->hang_detected = 1; - raw_spin_unlock(&cpu_base->lock); - delta = ktime_sub(now, entry_time); - if (delta.tv64 > cpu_base->max_hang_time.tv64) - cpu_base->max_hang_time = delta; - /* - * Limit it to a sensible value as we enforce a longer - * delay. Give the CPU at least 100ms to catch up. 
- */ - if (delta.tv64 > 100 * NSEC_PER_MSEC) - expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); - else - expires_next = ktime_add(now, delta); - tick_program_event(expires_next, 1); - printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", - ktime_to_ns(delta)); -} - -/* - * local version of hrtimer_peek_ahead_timers() called with interrupts - * disabled. - */ -static void __hrtimer_peek_ahead_timers(void) -{ - struct tick_device *td; - - if (!hrtimer_hres_active()) - return; - - td = &__get_cpu_var(tick_cpu_device); - if (td && td->evtdev) - hrtimer_interrupt(td->evtdev); -} - -/** - * hrtimer_peek_ahead_timers -- run soft-expired timers now - * - * hrtimer_peek_ahead_timers will peek at the timer queue of - * the current cpu and check if there are any timers for which - * the soft expires time has passed. If any such timers exist, - * they are run immediately and then removed from the timer queue. - * - */ -void hrtimer_peek_ahead_timers(void) -{ - unsigned long flags; - - local_irq_save(flags); - __hrtimer_peek_ahead_timers(); - local_irq_restore(flags); -} - -static void run_hrtimer_softirq(struct softirq_action *h) -{ - hrtimer_peek_ahead_timers(); -} - -#else /* CONFIG_HIGH_RES_TIMERS */ - -static inline void __hrtimer_peek_ahead_timers(void) { } - -#endif /* !CONFIG_HIGH_RES_TIMERS */ - -/* - * Called from timer softirq every jiffy, expire hrtimers: - * - * For HRT its the fall back code to run the softirq in the timer - * softirq context in case the hrtimer initialization failed or has - * not been done yet. - */ -void hrtimer_run_pending(void) -{ - if (hrtimer_hres_active()) - return; - - /* - * This _is_ ugly: We have to check in the softirq context, - * whether we can switch to highres and / or nohz mode. The - * clocksource switch happens in the timer interrupt with - * xtime_lock held. Notification from there only sets the - * check bit in the tick_oneshot code, otherwise we might - * deadlock vs. xtime_lock. 
- */ - if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - hrtimer_switch_to_hres(); -} - -/* - * Called from hardirq context every jiffy - */ -void hrtimer_run_queues(void) -{ - struct timerqueue_node *node; - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - struct hrtimer_clock_base *base; - int index, gettime = 1; - - if (hrtimer_hres_active()) - return; - - for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { - base = &cpu_base->clock_base[index]; - if (!timerqueue_getnext(&base->active)) - continue; - - if (gettime) { - hrtimer_get_softirq_time(cpu_base); - gettime = 0; - } - - raw_spin_lock(&cpu_base->lock); - - while ((node = timerqueue_getnext(&base->active))) { - struct hrtimer *timer; - - timer = container_of(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= - hrtimer_get_expires_tv64(timer)) - break; - - __run_hrtimer(timer, &base->softirq_time); - } - raw_spin_unlock(&cpu_base->lock); - } -} - -/* - * Sleep related functions: - */ -static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) -{ - struct hrtimer_sleeper *t = - container_of(timer, struct hrtimer_sleeper, timer); - struct task_struct *task = t->task; - - t->task = NULL; - if (task) - wake_up_process(task); - - return HRTIMER_NORESTART; -} - -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) -{ - sl->timer.function = hrtimer_wakeup; - sl->task = task; -} -EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); - -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) -{ - hrtimer_init_sleeper(t, current); - - do { - set_current_state(TASK_INTERRUPTIBLE); - hrtimer_start_expires(&t->timer, mode); - if (!hrtimer_active(&t->timer)) - t->task = NULL; - - if (likely(t->task)) - freezable_schedule(); - - hrtimer_cancel(&t->timer); - mode = HRTIMER_MODE_ABS; - - } while (t->task && !signal_pending(current)); - - __set_current_state(TASK_RUNNING); - - return t->task == NULL; -} - -static int 
update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) -{ - struct timespec rmt; - ktime_t rem; - - rem = hrtimer_expires_remaining(timer); - if (rem.tv64 <= 0) - return 0; - rmt = ktime_to_timespec(rem); - - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; - - return 1; -} - -long __sched hrtimer_nanosleep_restart(struct restart_block *restart) -{ - struct hrtimer_sleeper t; - struct timespec __user *rmtp; - int ret = 0; - - hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, - HRTIMER_MODE_ABS); - hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); - - if (do_nanosleep(&t, HRTIMER_MODE_ABS)) - goto out; - - rmtp = restart->nanosleep.rmtp; - if (rmtp) { - ret = update_rmtp(&t.timer, rmtp); - if (ret <= 0) - goto out; - } - - /* The other values in restart are already filled in */ - ret = -ERESTART_RESTARTBLOCK; -out: - destroy_hrtimer_on_stack(&t.timer); - return ret; -} - -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, - const enum hrtimer_mode mode, const clockid_t clockid) -{ - struct restart_block *restart; - struct hrtimer_sleeper t; - int ret = 0; - unsigned long slack; - - slack = current->timer_slack_ns; - if (dl_task(current) || rt_task(current)) - slack = 0; - - hrtimer_init_on_stack(&t.timer, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); - if (do_nanosleep(&t, mode)) - goto out; - - /* Absolute timers do not update the rmtp value and restart: */ - if (mode == HRTIMER_MODE_ABS) { - ret = -ERESTARTNOHAND; - goto out; - } - - if (rmtp) { - ret = update_rmtp(&t.timer, rmtp); - if (ret <= 0) - goto out; - } - - restart = &current_thread_info()->restart_block; - restart->fn = hrtimer_nanosleep_restart; - restart->nanosleep.clockid = t.timer.base->clockid; - restart->nanosleep.rmtp = rmtp; - restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); - - ret = -ERESTART_RESTARTBLOCK; -out: - destroy_hrtimer_on_stack(&t.timer); - return ret; -} - 
-SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, - struct timespec __user *, rmtp) -{ - struct timespec tu; - - if (copy_from_user(&tu, rqtp, sizeof(tu))) - return -EFAULT; - - if (!timespec_valid(&tu)) - return -EINVAL; - - return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); -} - -/* - * Functions related to boot-time initialization: - */ -static void init_hrtimers_cpu(int cpu) -{ - struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); - int i; - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - cpu_base->clock_base[i].cpu_base = cpu_base; - timerqueue_init_head(&cpu_base->clock_base[i].active); - } - - hrtimer_init_hres(cpu_base); -} - -#ifdef CONFIG_HOTPLUG_CPU - -static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) -{ - struct hrtimer *timer; - struct timerqueue_node *node; - - while ((node = timerqueue_getnext(&old_base->active))) { - timer = container_of(node, struct hrtimer, node); - BUG_ON(hrtimer_callback_running(timer)); - debug_deactivate(timer); - - /* - * Mark it as STATE_MIGRATE not INACTIVE otherwise the - * timer could be seen as !active and just vanish away - * under us on another CPU - */ - __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); - timer->base = new_base; - /* - * Enqueue the timers on the new cpu. This does not - * reprogram the event device in case the timer - * expires before the earliest on this CPU, but we run - * hrtimer_interrupt after we migrated everything to - * sort out already expired timers and reprogram the - * event device. 
- */ - enqueue_hrtimer(timer, new_base); - - /* Clear the migration state bit */ - timer->state &= ~HRTIMER_STATE_MIGRATE; - } -} - -static void migrate_hrtimers(int scpu) -{ - struct hrtimer_cpu_base *old_base, *new_base; - int i; - - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); - - local_irq_disable(); - old_base = &per_cpu(hrtimer_bases, scpu); - new_base = &__get_cpu_var(hrtimer_bases); - /* - * The caller is globally serialized and nobody else - * takes two locks at once, deadlock is not possible. - */ - raw_spin_lock(&new_base->lock); - raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); - } - - raw_spin_unlock(&old_base->lock); - raw_spin_unlock(&new_base->lock); - - /* Check, if we got expired work to do */ - __hrtimer_peek_ahead_timers(); - local_irq_enable(); -} - -#endif /* CONFIG_HOTPLUG_CPU */ - -static int hrtimer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - int scpu = (long)hcpu; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - init_hrtimers_cpu(scpu); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DYING: - case CPU_DYING_FROZEN: - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - { - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); - migrate_hrtimers(scpu); - break; - } -#endif - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block hrtimers_nb = { - .notifier_call = hrtimer_cpu_notify, -}; - -void __init hrtimers_init(void) -{ - hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS - open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif -} - -/** - * schedule_hrtimeout_range_clock - sleep until timeout - * @expires: timeout 
value (ktime_t) - * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL - * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME - */ -int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, - const enum hrtimer_mode mode, int clock) -{ - struct hrtimer_sleeper t; - - /* - * Optimize when a zero timeout value is given. It does not - * matter whether this is an absolute or a relative time. - */ - if (expires && !expires->tv64) { - __set_current_state(TASK_RUNNING); - return 0; - } - - /* - * A NULL parameter means "infinite" - */ - if (!expires) { - schedule(); - __set_current_state(TASK_RUNNING); - return -EINTR; - } - - hrtimer_init_on_stack(&t.timer, clock, mode); - hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - - hrtimer_init_sleeper(&t, current); - - hrtimer_start_expires(&t.timer, mode); - if (!hrtimer_active(&t.timer)) - t.task = NULL; - - if (likely(t.task)) - schedule(); - - hrtimer_cancel(&t.timer); - destroy_hrtimer_on_stack(&t.timer); - - __set_current_state(TASK_RUNNING); - - return !t.task ? 0 : -EINTR; -} - -/** - * schedule_hrtimeout_range - sleep until timeout - * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * The @delta argument gives the kernel the freedom to schedule the - * actual wakeup to a time that is both power and performance friendly. - * The kernel give the normal best effort behavior for "@expires+@delta", - * but may decide to fire the timer earlier, but no earlier than @expires. - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. 
- * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns 0 when the timer has expired otherwise -EINTR - */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, - const enum hrtimer_mode mode) -{ - return schedule_hrtimeout_range_clock(expires, delta, mode, - CLOCK_MONOTONIC); -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); - -/** - * schedule_hrtimeout - sleep until timeout - * @expires: timeout value (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. 
- * - * Returns 0 when the timer has expired otherwise -EINTR - */ -int __sched schedule_hrtimeout(ktime_t *expires, - const enum hrtimer_mode mode) -{ - return schedule_hrtimeout_range(expires, 0, mode); -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout); diff --git a/kernel/itimer.c b/kernel/itimer.c deleted file mode 100644 index 8d262b4..0000000 --- a/kernel/itimer.c +++ /dev/null @@ -1,301 +0,0 @@ -/* - * linux/kernel/itimer.c - * - * Copyright (C) 1992 Darren Senn - */ - -/* These are all the functions necessary to implement itimers */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -/** - * itimer_get_remtime - get remaining time for the timer - * - * @timer: the timer to read - * - * Returns the delta between the expiry time and now, which can be - * less than zero or 1usec for an pending expired timer - */ -static struct timeval itimer_get_remtime(struct hrtimer *timer) -{ - ktime_t rem = hrtimer_get_remaining(timer); - - /* - * Racy but safe: if the itimer expires after the above - * hrtimer_get_remtime() call but before this condition - * then we return 0 - which is correct. 
- */ - if (hrtimer_active(timer)) { - if (rem.tv64 <= 0) - rem.tv64 = NSEC_PER_USEC; - } else - rem.tv64 = 0; - - return ktime_to_timeval(rem); -} - -static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - struct itimerval *const value) -{ - cputime_t cval, cinterval; - struct cpu_itimer *it = &tsk->signal->it[clock_id]; - - spin_lock_irq(&tsk->sighand->siglock); - - cval = it->expires; - cinterval = it->incr; - if (cval) { - struct task_cputime cputime; - cputime_t t; - - thread_group_cputimer(tsk, &cputime); - if (clock_id == CPUCLOCK_PROF) - t = cputime.utime + cputime.stime; - else - /* CPUCLOCK_VIRT */ - t = cputime.utime; - - if (cval < t) - /* about to fire */ - cval = cputime_one_jiffy; - else - cval = cval - t; - } - - spin_unlock_irq(&tsk->sighand->siglock); - - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); -} - -int do_getitimer(int which, struct itimerval *value) -{ - struct task_struct *tsk = current; - - switch (which) { - case ITIMER_REAL: - spin_lock_irq(&tsk->sighand->siglock); - value->it_value = itimer_get_remtime(&tsk->signal->real_timer); - value->it_interval = - ktime_to_timeval(tsk->signal->it_real_incr); - spin_unlock_irq(&tsk->sighand->siglock); - break; - case ITIMER_VIRTUAL: - get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); - break; - case ITIMER_PROF: - get_cpu_itimer(tsk, CPUCLOCK_PROF, value); - break; - default: - return(-EINVAL); - } - return 0; -} - -SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) -{ - int error = -EFAULT; - struct itimerval get_buffer; - - if (value) { - error = do_getitimer(which, &get_buffer); - if (!error && - copy_to_user(value, &get_buffer, sizeof(get_buffer))) - error = -EFAULT; - } - return error; -} - - -/* - * The timer is automagically restarted, when interval != 0 - */ -enum hrtimer_restart it_real_fn(struct hrtimer *timer) -{ - struct signal_struct *sig = - container_of(timer, struct signal_struct, real_timer); 
- - trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); - kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); - - return HRTIMER_NORESTART; -} - -static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) -{ - struct timespec ts; - s64 cpu_ns; - - cputime_to_timespec(ct, &ts); - cpu_ns = timespec_to_ns(&ts); - - return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; -} - -static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - const struct itimerval *const value, - struct itimerval *const ovalue) -{ - cputime_t cval, nval, cinterval, ninterval; - s64 ns_ninterval, ns_nval; - u32 error, incr_error; - struct cpu_itimer *it = &tsk->signal->it[clock_id]; - - nval = timeval_to_cputime(&value->it_value); - ns_nval = timeval_to_ns(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - ns_ninterval = timeval_to_ns(&value->it_interval); - - error = cputime_sub_ns(nval, ns_nval); - incr_error = cputime_sub_ns(ninterval, ns_ninterval); - - spin_lock_irq(&tsk->sighand->siglock); - - cval = it->expires; - cinterval = it->incr; - if (cval || nval) { - if (nval > 0) - nval += cputime_one_jiffy; - set_process_cpu_timer(tsk, clock_id, &nval, &cval); - } - it->expires = nval; - it->incr = ninterval; - it->error = error; - it->incr_error = incr_error; - trace_itimer_state(clock_id == CPUCLOCK_VIRT ? - ITIMER_VIRTUAL : ITIMER_PROF, value, nval); - - spin_unlock_irq(&tsk->sighand->siglock); - - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } -} - -/* - * Returns true if the timeval is in canonical form - */ -#define timeval_valid(t) \ - (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) - -int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) -{ - struct task_struct *tsk = current; - struct hrtimer *timer; - ktime_t expires; - - /* - * Validate the timevals in value. 
- */ - if (!timeval_valid(&value->it_value) || - !timeval_valid(&value->it_interval)) - return -EINVAL; - - switch (which) { - case ITIMER_REAL: -again: - spin_lock_irq(&tsk->sighand->siglock); - timer = &tsk->signal->real_timer; - if (ovalue) { - ovalue->it_value = itimer_get_remtime(timer); - ovalue->it_interval - = ktime_to_timeval(tsk->signal->it_real_incr); - } - /* We are sharing ->siglock with it_real_fn() */ - if (hrtimer_try_to_cancel(timer) < 0) { - spin_unlock_irq(&tsk->sighand->siglock); - goto again; - } - expires = timeval_to_ktime(value->it_value); - if (expires.tv64 != 0) { - tsk->signal->it_real_incr = - timeval_to_ktime(value->it_interval); - hrtimer_start(timer, expires, HRTIMER_MODE_REL); - } else - tsk->signal->it_real_incr.tv64 = 0; - - trace_itimer_state(ITIMER_REAL, value, 0); - spin_unlock_irq(&tsk->sighand->siglock); - break; - case ITIMER_VIRTUAL: - set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); - break; - case ITIMER_PROF: - set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); - break; - default: - return -EINVAL; - } - return 0; -} - -/** - * alarm_setitimer - set alarm in seconds - * - * @seconds: number of seconds until alarm - * 0 disables the alarm - * - * Returns the remaining time in seconds of a pending timer or 0 when - * the timer is not active. - * - * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid - * negative timeval settings which would cause immediate expiry. - */ -unsigned int alarm_setitimer(unsigned int seconds) -{ - struct itimerval it_new, it_old; - -#if BITS_PER_LONG < 64 - if (seconds > INT_MAX) - seconds = INT_MAX; -#endif - it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; - - do_setitimer(ITIMER_REAL, &it_new, &it_old); - - /* - * We can't return 0 if we have an alarm pending ... 
And we'd - * better return too much than too little anyway - */ - if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || - it_old.it_value.tv_usec >= 500000) - it_old.it_value.tv_sec++; - - return it_old.it_value.tv_sec; -} - -SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, - struct itimerval __user *, ovalue) -{ - struct itimerval set_buffer, get_buffer; - int error; - - if (value) { - if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) - return -EFAULT; - } else { - memset(&set_buffer, 0, sizeof(set_buffer)); - printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." - " Misfeature support will be removed\n", - current->comm); - } - - error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); - if (error || !ovalue) - return error; - - if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) - return -EFAULT; - return 0; -} diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c deleted file mode 100644 index 3b89464..0000000 --- a/kernel/posix-cpu-timers.c +++ /dev/null @@ -1,1490 +0,0 @@ -/* - * Implement CPU time clocks for the POSIX clock interface. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Called after updating RLIMIT_CPU to run cpu timer and update - * tsk->signal->cputime_expires expiration cache if necessary. Needs - * siglock protection since other code may update expiration cache as - * well. 
- */ -void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) -{ - cputime_t cputime = secs_to_cputime(rlim_new); - - spin_lock_irq(&task->sighand->siglock); - set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(&task->sighand->siglock); -} - -static int check_clock(const clockid_t which_clock) -{ - int error = 0; - struct task_struct *p; - const pid_t pid = CPUCLOCK_PID(which_clock); - - if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) - return -EINVAL; - - if (pid == 0) - return 0; - - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? - same_thread_group(p, current) : has_group_leader_pid(p))) { - error = -EINVAL; - } - rcu_read_unlock(); - - return error; -} - -static inline unsigned long long -timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) -{ - unsigned long long ret; - - ret = 0; /* high half always zero when .cpu used */ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; - } else { - ret = cputime_to_expires(timespec_to_cputime(tp)); - } - return ret; -} - -static void sample_to_timespec(const clockid_t which_clock, - unsigned long long expires, - struct timespec *tp) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) - *tp = ns_to_timespec(expires); - else - cputime_to_timespec((__force cputime_t)expires, tp); -} - -/* - * Update expiry time from increment, and increase overrun count, - * given the current clock sample. - */ -static void bump_cpu_timer(struct k_itimer *timer, - unsigned long long now) -{ - int i; - unsigned long long delta, incr; - - if (timer->it.cpu.incr == 0) - return; - - if (now < timer->it.cpu.expires) - return; - - incr = timer->it.cpu.incr; - delta = now + incr - timer->it.cpu.expires; - - /* Don't use (incr*2 < delta), incr*2 might overflow. 
*/ - for (i = 0; incr < delta - incr; i++) - incr = incr << 1; - - for (; i >= 0; incr >>= 1, i--) { - if (delta < incr) - continue; - - timer->it.cpu.expires += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } -} - -/** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime: The struct to compare. - * - * Checks @cputime to see if all fields are zero. Returns true if all fields - * are zero, false if any field is nonzero. - */ -static inline int task_cputime_zero(const struct task_cputime *cputime) -{ - if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) - return 1; - return 0; -} - -static inline unsigned long long prof_ticks(struct task_struct *p) -{ - cputime_t utime, stime; - - task_cputime(p, &utime, &stime); - - return cputime_to_expires(utime + stime); -} -static inline unsigned long long virt_ticks(struct task_struct *p) -{ - cputime_t utime; - - task_cputime(p, &utime, NULL); - - return cputime_to_expires(utime); -} - -static int -posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) -{ - int error = check_clock(which_clock); - if (!error) { - tp->tv_sec = 0; - tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - /* - * If sched_clock is using a cycle counter, we - * don't have any idea of its true resolution - * exported, but it is much more than 1s/HZ. - */ - tp->tv_nsec = 1; - } - } - return error; -} - -static int -posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) -{ - /* - * You can never reset a CPU clock, but we check for other errors - * in the call before failing with EPERM. - */ - int error = check_clock(which_clock); - if (error == 0) { - error = -EPERM; - } - return error; -} - - -/* - * Sample a per-thread clock for the given task. 
- */ -static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, - unsigned long long *sample) -{ - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - *sample = prof_ticks(p); - break; - case CPUCLOCK_VIRT: - *sample = virt_ticks(p); - break; - case CPUCLOCK_SCHED: - *sample = task_sched_runtime(p); - break; - } - return 0; -} - -static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) -{ - if (b->utime > a->utime) - a->utime = b->utime; - - if (b->stime > a->stime) - a->stime = b->stime; - - if (b->sum_exec_runtime > a->sum_exec_runtime) - a->sum_exec_runtime = b->sum_exec_runtime; -} - -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) -{ - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - struct task_cputime sum; - unsigned long flags; - - if (!cputimer->running) { - /* - * The POSIX timer interface allows for absolute time expiry - * values through the TIMER_ABSTIME flag, therefore we have - * to synchronize the timer to the clock every time we start - * it. - */ - thread_group_cputime(tsk, &sum); - raw_spin_lock_irqsave(&cputimer->lock, flags); - cputimer->running = 1; - update_gt_cputime(&cputimer->cputime, &sum); - } else - raw_spin_lock_irqsave(&cputimer->lock, flags); - *times = cputimer->cputime; - raw_spin_unlock_irqrestore(&cputimer->lock, flags); -} - -/* - * Sample a process (thread group) clock for the given group_leader task. - * Must be called with task sighand lock held for safe while_each_thread() - * traversal. 
- */ -static int cpu_clock_sample_group(const clockid_t which_clock, - struct task_struct *p, - unsigned long long *sample) -{ - struct task_cputime cputime; - - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - thread_group_cputime(p, &cputime); - *sample = cputime_to_expires(cputime.utime + cputime.stime); - break; - case CPUCLOCK_VIRT: - thread_group_cputime(p, &cputime); - *sample = cputime_to_expires(cputime.utime); - break; - case CPUCLOCK_SCHED: - thread_group_cputime(p, &cputime); - *sample = cputime.sum_exec_runtime; - break; - } - return 0; -} - -static int posix_cpu_clock_get_task(struct task_struct *tsk, - const clockid_t which_clock, - struct timespec *tp) -{ - int err = -EINVAL; - unsigned long long rtn; - - if (CPUCLOCK_PERTHREAD(which_clock)) { - if (same_thread_group(tsk, current)) - err = cpu_clock_sample(which_clock, tsk, &rtn); - } else { - unsigned long flags; - struct sighand_struct *sighand; - - /* - * while_each_thread() is not yet entirely RCU safe, - * keep locking the group while sampling process - * clock for now. - */ - sighand = lock_task_sighand(tsk, &flags); - if (!sighand) - return err; - - if (tsk == current || thread_group_leader(tsk)) - err = cpu_clock_sample_group(which_clock, tsk, &rtn); - - unlock_task_sighand(tsk, &flags); - } - - if (!err) - sample_to_timespec(which_clock, rtn, tp); - - return err; -} - - -static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) -{ - const pid_t pid = CPUCLOCK_PID(which_clock); - int err = -EINVAL; - - if (pid == 0) { - /* - * Special case constant value for our own clocks. - * We don't have to do any lookup to find ourselves. - */ - err = posix_cpu_clock_get_task(current, which_clock, tp); - } else { - /* - * Find the given PID, and validate that the caller - * should be able to see it. 
- */ - struct task_struct *p; - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (p) - err = posix_cpu_clock_get_task(p, which_clock, tp); - rcu_read_unlock(); - } - - return err; -} - - -/* - * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. - * This is called from sys_timer_create() and do_cpu_nanosleep() with the - * new timer already all-zeros initialized. - */ -static int posix_cpu_timer_create(struct k_itimer *new_timer) -{ - int ret = 0; - const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); - struct task_struct *p; - - if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) - return -EINVAL; - - INIT_LIST_HEAD(&new_timer->it.cpu.entry); - - rcu_read_lock(); - if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { - if (pid == 0) { - p = current; - } else { - p = find_task_by_vpid(pid); - if (p && !same_thread_group(p, current)) - p = NULL; - } - } else { - if (pid == 0) { - p = current->group_leader; - } else { - p = find_task_by_vpid(pid); - if (p && !has_group_leader_pid(p)) - p = NULL; - } - } - new_timer->it.cpu.task = p; - if (p) { - get_task_struct(p); - } else { - ret = -EINVAL; - } - rcu_read_unlock(); - - return ret; -} - -/* - * Clean up a CPU-clock timer that is about to be destroyed. - * This is called from timer deletion with the timer already locked. - * If we return TIMER_RETRY, it's necessary to release the timer's lock - * and try again. (This happens when the timer is in the middle of firing.) - */ -static int posix_cpu_timer_del(struct k_itimer *timer) -{ - int ret = 0; - unsigned long flags; - struct sighand_struct *sighand; - struct task_struct *p = timer->it.cpu.task; - - WARN_ON_ONCE(p == NULL); - - /* - * Protect against sighand release/switch in exit/exec and process/ - * thread timer list entry concurrent read/writes. - */ - sighand = lock_task_sighand(p, &flags); - if (unlikely(sighand == NULL)) { - /* - * We raced with the reaping of the task. - * The deletion should have cleared us off the list. 
- */ - WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry)); - } else { - if (timer->it.cpu.firing) - ret = TIMER_RETRY; - else - list_del(&timer->it.cpu.entry); - - unlock_task_sighand(p, &flags); - } - - if (!ret) - put_task_struct(p); - - return ret; -} - -static void cleanup_timers_list(struct list_head *head) -{ - struct cpu_timer_list *timer, *next; - - list_for_each_entry_safe(timer, next, head, entry) - list_del_init(&timer->entry); -} - -/* - * Clean out CPU timers still ticking when a thread exited. The task - * pointer is cleared, and the expiry time is replaced with the residual - * time for later timer_gettime calls to return. - * This must be called with the siglock held. - */ -static void cleanup_timers(struct list_head *head) -{ - cleanup_timers_list(head); - cleanup_timers_list(++head); - cleanup_timers_list(++head); -} - -/* - * These are both called with the siglock held, when the current thread - * is being reaped. When the final (leader) thread in the group is reaped, - * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit. - */ -void posix_cpu_timers_exit(struct task_struct *tsk) -{ - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, - sizeof(unsigned long long)); - cleanup_timers(tsk->cpu_timers); - -} -void posix_cpu_timers_exit_group(struct task_struct *tsk) -{ - cleanup_timers(tsk->signal->cpu_timers); -} - -static inline int expires_gt(cputime_t expires, cputime_t new_exp) -{ - return expires == 0 || expires > new_exp; -} - -/* - * Insert the timer on the appropriate list before any timers that - * expire later. This must be called with the sighand lock held. 
- */ -static void arm_timer(struct k_itimer *timer) -{ - struct task_struct *p = timer->it.cpu.task; - struct list_head *head, *listpos; - struct task_cputime *cputime_expires; - struct cpu_timer_list *const nt = &timer->it.cpu; - struct cpu_timer_list *next; - - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - head = p->cpu_timers; - cputime_expires = &p->cputime_expires; - } else { - head = p->signal->cpu_timers; - cputime_expires = &p->signal->cputime_expires; - } - head += CPUCLOCK_WHICH(timer->it_clock); - - listpos = head; - list_for_each_entry(next, head, entry) { - if (nt->expires < next->expires) - break; - listpos = &next->entry; - } - list_add(&nt->entry, listpos); - - if (listpos == head) { - unsigned long long exp = nt->expires; - - /* - * We are the new earliest-expiring POSIX 1.b timer, hence - * need to update expiration cache. Take into account that - * for process timers we share expiration cache with itimers - * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. - */ - - switch (CPUCLOCK_WHICH(timer->it_clock)) { - case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) - cputime_expires->prof_exp = expires_to_cputime(exp); - break; - case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) - cputime_expires->virt_exp = expires_to_cputime(exp); - break; - case CPUCLOCK_SCHED: - if (cputime_expires->sched_exp == 0 || - cputime_expires->sched_exp > exp) - cputime_expires->sched_exp = exp; - break; - } - } -} - -/* - * The timer is locked, fire it and arrange for its reload. - */ -static void cpu_timer_fire(struct k_itimer *timer) -{ - if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - /* - * User don't want any signal. - */ - timer->it.cpu.expires = 0; - } else if (unlikely(timer->sigq == NULL)) { - /* - * This a special case for clock_nanosleep, - * not a normal timer from sys_timer_create. 
- */ - wake_up_process(timer->it_process); - timer->it.cpu.expires = 0; - } else if (timer->it.cpu.incr == 0) { - /* - * One-shot timer. Clear it as soon as it's fired. - */ - posix_timer_event(timer, 0); - timer->it.cpu.expires = 0; - } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { - /* - * The signal did not get queued because the signal - * was ignored, so we won't get any callback to - * reload the timer. But we need to keep it - * ticking in case the signal is deliverable next time. - */ - posix_cpu_timer_schedule(timer); - } -} - -/* - * Sample a process (thread group) timer for the given group_leader task. - * Must be called with task sighand lock held for safe while_each_thread() - * traversal. - */ -static int cpu_timer_sample_group(const clockid_t which_clock, - struct task_struct *p, - unsigned long long *sample) -{ - struct task_cputime cputime; - - thread_group_cputimer(p, &cputime); - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - *sample = cputime_to_expires(cputime.utime + cputime.stime); - break; - case CPUCLOCK_VIRT: - *sample = cputime_to_expires(cputime.utime); - break; - case CPUCLOCK_SCHED: - *sample = cputime.sum_exec_runtime + task_delta_exec(p); - break; - } - return 0; -} - -#ifdef CONFIG_NO_HZ_FULL -static void nohz_kick_work_fn(struct work_struct *work) -{ - tick_nohz_full_kick_all(); -} - -static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); - -/* - * We need the IPIs to be sent from sane process context. - * The posix cpu timers are always set with irqs disabled. 
- */ -static void posix_cpu_timer_kick_nohz(void) -{ - if (context_tracking_is_enabled()) - schedule_work(&nohz_kick_work); -} - -bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) -{ - if (!task_cputime_zero(&tsk->cputime_expires)) - return false; - - if (tsk->signal->cputimer.running) - return false; - - return true; -} -#else -static inline void posix_cpu_timer_kick_nohz(void) { } -#endif - -/* - * Guts of sys_timer_settime for CPU timers. - * This is called with the timer locked and interrupts disabled. - * If we return TIMER_RETRY, it's necessary to release the timer's lock - * and try again. (This happens when the timer is in the middle of firing.) - */ -static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, - struct itimerspec *new, struct itimerspec *old) -{ - unsigned long flags; - struct sighand_struct *sighand; - struct task_struct *p = timer->it.cpu.task; - unsigned long long old_expires, new_expires, old_incr, val; - int ret; - - WARN_ON_ONCE(p == NULL); - - new_expires = timespec_to_sample(timer->it_clock, &new->it_value); - - /* - * Protect against sighand release/switch in exit/exec and p->cpu_timers - * and p->signal->cpu_timers read/write in arm_timer() - */ - sighand = lock_task_sighand(p, &flags); - /* - * If p has just been reaped, we can no - * longer get any information about it at all. - */ - if (unlikely(sighand == NULL)) { - return -ESRCH; - } - - /* - * Disarm any old timer after extracting its expiry time. - */ - WARN_ON_ONCE(!irqs_disabled()); - - ret = 0; - old_incr = timer->it.cpu.incr; - old_expires = timer->it.cpu.expires; - if (unlikely(timer->it.cpu.firing)) { - timer->it.cpu.firing = -1; - ret = TIMER_RETRY; - } else - list_del_init(&timer->it.cpu.entry); - - /* - * We need to sample the current value to convert the new - * value from to relative and absolute, and to convert the - * old value from absolute to relative. 
To set a process - * timer, we need a sample to balance the thread expiry - * times (in arm_timer). With an absolute time, we must - * check if it's already passed. In short, we need a sample. - */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &val); - } else { - cpu_timer_sample_group(timer->it_clock, p, &val); - } - - if (old) { - if (old_expires == 0) { - old->it_value.tv_sec = 0; - old->it_value.tv_nsec = 0; - } else { - /* - * Update the timer in case it has - * overrun already. If it has, - * we'll report it as having overrun - * and with the next reloaded timer - * already ticking, though we are - * swallowing that pending - * notification here to install the - * new setting. - */ - bump_cpu_timer(timer, val); - if (val < timer->it.cpu.expires) { - old_expires = timer->it.cpu.expires - val; - sample_to_timespec(timer->it_clock, - old_expires, - &old->it_value); - } else { - old->it_value.tv_nsec = 1; - old->it_value.tv_sec = 0; - } - } - } - - if (unlikely(ret)) { - /* - * We are colliding with the timer actually firing. - * Punt after filling in the timer's old value, and - * disable this firing since we are already reporting - * it as an overrun (thanks to bump_cpu_timer above). - */ - unlock_task_sighand(p, &flags); - goto out; - } - - if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) { - new_expires += val; - } - - /* - * Install the new expiry time (or zero). - * For a timer with no notification action, we don't actually - * arm the timer (we'll just fake it for timer_gettime). - */ - timer->it.cpu.expires = new_expires; - if (new_expires != 0 && val < new_expires) { - arm_timer(timer); - } - - unlock_task_sighand(p, &flags); - /* - * Install the new reload setting, and - * set up the signal and overrun bookkeeping. 
- */ - timer->it.cpu.incr = timespec_to_sample(timer->it_clock, - &new->it_interval); - - /* - * This acts as a modification timestamp for the timer, - * so any automatic reload attempt will punt on seeing - * that we have reset the timer manually. - */ - timer->it_requeue_pending = (timer->it_requeue_pending + 2) & - ~REQUEUE_PENDING; - timer->it_overrun_last = 0; - timer->it_overrun = -1; - - if (new_expires != 0 && !(val < new_expires)) { - /* - * The designated time already passed, so we notify - * immediately, even if the thread never runs to - * accumulate more time on this clock. - */ - cpu_timer_fire(timer); - } - - ret = 0; - out: - if (old) { - sample_to_timespec(timer->it_clock, - old_incr, &old->it_interval); - } - if (!ret) - posix_cpu_timer_kick_nohz(); - return ret; -} - -static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) -{ - unsigned long long now; - struct task_struct *p = timer->it.cpu.task; - - WARN_ON_ONCE(p == NULL); - - /* - * Easy part: convert the reload time. - */ - sample_to_timespec(timer->it_clock, - timer->it.cpu.incr, &itp->it_interval); - - if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ - itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; - return; - } - - /* - * Sample the clock to take the difference with the expiry time. - */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &now); - } else { - struct sighand_struct *sighand; - unsigned long flags; - - /* - * Protect against sighand release/switch in exit/exec and - * also make timer sampling safe if it ends up calling - * thread_group_cputime(). - */ - sighand = lock_task_sighand(p, &flags); - if (unlikely(sighand == NULL)) { - /* - * The process has been reaped. - * We can't even collect a sample any more. - * Call the timer disarmed, nothing else to do. 
- */ - timer->it.cpu.expires = 0; - sample_to_timespec(timer->it_clock, timer->it.cpu.expires, - &itp->it_value); - } else { - cpu_timer_sample_group(timer->it_clock, p, &now); - unlock_task_sighand(p, &flags); - } - } - - if (now < timer->it.cpu.expires) { - sample_to_timespec(timer->it_clock, - timer->it.cpu.expires - now, - &itp->it_value); - } else { - /* - * The timer should have expired already, but the firing - * hasn't taken place yet. Say it's just about to expire. - */ - itp->it_value.tv_nsec = 1; - itp->it_value.tv_sec = 0; - } -} - -static unsigned long long -check_timers_list(struct list_head *timers, - struct list_head *firing, - unsigned long long curr) -{ - int maxfire = 20; - - while (!list_empty(timers)) { - struct cpu_timer_list *t; - - t = list_first_entry(timers, struct cpu_timer_list, entry); - - if (!--maxfire || curr < t->expires) - return t->expires; - - t->firing = 1; - list_move_tail(&t->entry, firing); - } - - return 0; -} - -/* - * Check for any per-thread CPU timers that have fired and move them off - * the tsk->cpu_timers[N] list onto the firing list. Here we update the - * tsk->it_*_expires values to reflect the remaining thread CPU timers. - */ -static void check_thread_timers(struct task_struct *tsk, - struct list_head *firing) -{ - struct list_head *timers = tsk->cpu_timers; - struct signal_struct *const sig = tsk->signal; - struct task_cputime *tsk_expires = &tsk->cputime_expires; - unsigned long long expires; - unsigned long soft; - - expires = check_timers_list(timers, firing, prof_ticks(tsk)); - tsk_expires->prof_exp = expires_to_cputime(expires); - - expires = check_timers_list(++timers, firing, virt_ticks(tsk)); - tsk_expires->virt_exp = expires_to_cputime(expires); - - tsk_expires->sched_exp = check_timers_list(++timers, firing, - tsk->se.sum_exec_runtime); - - /* - * Check for the special case thread timers. 
- */ - soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); - if (soft != RLIM_INFINITY) { - unsigned long hard = - ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); - - if (hard != RLIM_INFINITY && - tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { - /* - * At the hard limit, we just die. - * No need to calculate anything else now. - */ - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); - return; - } - if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { - /* - * At the soft limit, send a SIGXCPU every second. - */ - if (soft < hard) { - soft += USEC_PER_SEC; - sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; - } - printk(KERN_INFO - "RT Watchdog Timeout: %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - } - } -} - -static void stop_process_timers(struct signal_struct *sig) -{ - struct thread_group_cputimer *cputimer = &sig->cputimer; - unsigned long flags; - - raw_spin_lock_irqsave(&cputimer->lock, flags); - cputimer->running = 0; - raw_spin_unlock_irqrestore(&cputimer->lock, flags); -} - -static u32 onecputick; - -static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, - unsigned long long *expires, - unsigned long long cur_time, int signo) -{ - if (!it->expires) - return; - - if (cur_time >= it->expires) { - if (it->incr) { - it->expires += it->incr; - it->error += it->incr_error; - if (it->error >= onecputick) { - it->expires -= cputime_one_jiffy; - it->error -= onecputick; - } - } else { - it->expires = 0; - } - - trace_itimer_expire(signo == SIGPROF ? - ITIMER_PROF : ITIMER_VIRTUAL, - tsk->signal->leader_pid, cur_time); - __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); - } - - if (it->expires && (!*expires || it->expires < *expires)) { - *expires = it->expires; - } -} - -/* - * Check for any per-thread CPU timers that have fired and move them - * off the tsk->*_timers list onto the firing list. Per-thread timers - * have already been taken off. 
- */ -static void check_process_timers(struct task_struct *tsk, - struct list_head *firing) -{ - struct signal_struct *const sig = tsk->signal; - unsigned long long utime, ptime, virt_expires, prof_expires; - unsigned long long sum_sched_runtime, sched_expires; - struct list_head *timers = sig->cpu_timers; - struct task_cputime cputime; - unsigned long soft; - - /* - * Collect the current process totals. - */ - thread_group_cputimer(tsk, &cputime); - utime = cputime_to_expires(cputime.utime); - ptime = utime + cputime_to_expires(cputime.stime); - sum_sched_runtime = cputime.sum_exec_runtime; - - prof_expires = check_timers_list(timers, firing, ptime); - virt_expires = check_timers_list(++timers, firing, utime); - sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); - - /* - * Check for the special case process timers. - */ - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, - SIGPROF); - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, - SIGVTALRM); - soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); - if (soft != RLIM_INFINITY) { - unsigned long psecs = cputime_to_secs(ptime); - unsigned long hard = - ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); - cputime_t x; - if (psecs >= hard) { - /* - * At the hard limit, we just die. - * No need to calculate anything else now. - */ - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); - return; - } - if (psecs >= soft) { - /* - * At the soft limit, send a SIGXCPU every second. 
- */ - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - if (soft < hard) { - soft++; - sig->rlim[RLIMIT_CPU].rlim_cur = soft; - } - } - x = secs_to_cputime(soft); - if (!prof_expires || x < prof_expires) { - prof_expires = x; - } - } - - sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); - sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); - sig->cputime_expires.sched_exp = sched_expires; - if (task_cputime_zero(&sig->cputime_expires)) - stop_process_timers(sig); -} - -/* - * This is called from the signal code (via do_schedule_next_timer) - * when the last timer signal was delivered and we have to reload the timer. - */ -void posix_cpu_timer_schedule(struct k_itimer *timer) -{ - struct sighand_struct *sighand; - unsigned long flags; - struct task_struct *p = timer->it.cpu.task; - unsigned long long now; - - WARN_ON_ONCE(p == NULL); - - /* - * Fetch the current sample and update the timer's expiry time. - */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &now); - bump_cpu_timer(timer, now); - if (unlikely(p->exit_state)) - goto out; - - /* Protect timer list r/w in arm_timer() */ - sighand = lock_task_sighand(p, &flags); - if (!sighand) - goto out; - } else { - /* - * Protect arm_timer() and timer sampling in case of call to - * thread_group_cputime(). - */ - sighand = lock_task_sighand(p, &flags); - if (unlikely(sighand == NULL)) { - /* - * The process has been reaped. - * We can't even collect a sample any more. - */ - timer->it.cpu.expires = 0; - goto out; - } else if (unlikely(p->exit_state) && thread_group_empty(p)) { - unlock_task_sighand(p, &flags); - /* Optimizations: if the process is dying, no need to rearm */ - goto out; - } - cpu_timer_sample_group(timer->it_clock, p, &now); - bump_cpu_timer(timer, now); - /* Leave the sighand locked for the call below. */ - } - - /* - * Now re-arm for the new expiry time. 
- */ - WARN_ON_ONCE(!irqs_disabled()); - arm_timer(timer); - unlock_task_sighand(p, &flags); - - /* Kick full dynticks CPUs in case they need to tick on the new timer */ - posix_cpu_timer_kick_nohz(); -out: - timer->it_overrun_last = timer->it_overrun; - timer->it_overrun = -1; - ++timer->it_requeue_pending; -} - -/** - * task_cputime_expired - Compare two task_cputime entities. - * - * @sample: The task_cputime structure to be checked for expiration. - * @expires: Expiration times, against which @sample will be checked. - * - * Checks @sample against @expires to see if any field of @sample has expired. - * Returns true if any field of the former is greater than the corresponding - * field of the latter if the latter field is set. Otherwise returns false. - */ -static inline int task_cputime_expired(const struct task_cputime *sample, - const struct task_cputime *expires) -{ - if (expires->utime && sample->utime >= expires->utime) - return 1; - if (expires->stime && sample->utime + sample->stime >= expires->stime) - return 1; - if (expires->sum_exec_runtime != 0 && - sample->sum_exec_runtime >= expires->sum_exec_runtime) - return 1; - return 0; -} - -/** - * fastpath_timer_check - POSIX CPU timers fast path. - * - * @tsk: The task (thread) being checked. - * - * Check the task and thread group timers. If both are zero (there are no - * timers set) return false. Otherwise snapshot the task and thread group - * timers and compare them with the corresponding expiration times. Return - * true if a timer has expired, else return false. 
- */ -static inline int fastpath_timer_check(struct task_struct *tsk) -{ - struct signal_struct *sig; - cputime_t utime, stime; - - task_cputime(tsk, &utime, &stime); - - if (!task_cputime_zero(&tsk->cputime_expires)) { - struct task_cputime task_sample = { - .utime = utime, - .stime = stime, - .sum_exec_runtime = tsk->se.sum_exec_runtime - }; - - if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) - return 1; - } - - sig = tsk->signal; - if (sig->cputimer.running) { - struct task_cputime group_sample; - - raw_spin_lock(&sig->cputimer.lock); - group_sample = sig->cputimer.cputime; - raw_spin_unlock(&sig->cputimer.lock); - - if (task_cputime_expired(&group_sample, &sig->cputime_expires)) - return 1; - } - - return 0; -} - -/* - * This is called from the timer interrupt handler. The irq handler has - * already updated our counts. We need to check if any timers fire now. - * Interrupts are disabled. - */ -void run_posix_cpu_timers(struct task_struct *tsk) -{ - LIST_HEAD(firing); - struct k_itimer *timer, *next; - unsigned long flags; - - WARN_ON_ONCE(!irqs_disabled()); - - /* - * The fast path checks that there are no expired thread or thread - * group timers. If that's so, just return. - */ - if (!fastpath_timer_check(tsk)) - return; - - if (!lock_task_sighand(tsk, &flags)) - return; - /* - * Here we take off tsk->signal->cpu_timers[N] and - * tsk->cpu_timers[N] all the timers that are firing, and - * put them on the firing list. - */ - check_thread_timers(tsk, &firing); - /* - * If there are any active process wide timers (POSIX 1.b, itimers, - * RLIMIT_CPU) cputimer must be running. - */ - if (tsk->signal->cputimer.running) - check_process_timers(tsk, &firing); - - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. 
We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - unlock_task_sighand(tsk, &flags); - - /* - * Now that all the timers on our list have the firing flag, - * no one will touch their list entries but us. We'll take - * each timer's lock before clearing its firing flag, so no - * timer call will interfere. - */ - list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { - int cpu_firing; - - spin_lock(&timer->it_lock); - list_del_init(&timer->it.cpu.entry); - cpu_firing = timer->it.cpu.firing; - timer->it.cpu.firing = 0; - /* - * The firing flag is -1 if we collided with a reset - * of the timer, which already reported this - * almost-firing as an overrun. So don't generate an event. - */ - if (likely(cpu_firing >= 0)) - cpu_timer_fire(timer); - spin_unlock(&timer->it_lock); - } -} - -/* - * Set one of the process-wide special case CPU timers or RLIMIT_CPU. - * The tsk->sighand->siglock must be held by the caller. - */ -void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, - cputime_t *newval, cputime_t *oldval) -{ - unsigned long long now; - - WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); - cpu_timer_sample_group(clock_idx, tsk, &now); - - if (oldval) { - /* - * We are setting itimer. The *oldval is absolute and we update - * it to be relative, *newval argument is relative and we update - * it to be absolute. - */ - if (*oldval) { - if (*oldval <= now) { - /* Just about to fire. */ - *oldval = cputime_one_jiffy; - } else { - *oldval -= now; - } - } - - if (!*newval) - goto out; - *newval += now; - } - - /* - * Update expiration cache if we are the earliest timer, or eventually - * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. 
- */ - switch (clock_idx) { - case CPUCLOCK_PROF: - if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) - tsk->signal->cputime_expires.prof_exp = *newval; - break; - case CPUCLOCK_VIRT: - if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) - tsk->signal->cputime_expires.virt_exp = *newval; - break; - } -out: - posix_cpu_timer_kick_nohz(); -} - -static int do_cpu_nanosleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct itimerspec *it) -{ - struct k_itimer timer; - int error; - - /* - * Set up a temporary timer and then wait for it to go off. - */ - memset(&timer, 0, sizeof timer); - spin_lock_init(&timer.it_lock); - timer.it_clock = which_clock; - timer.it_overrun = -1; - error = posix_cpu_timer_create(&timer); - timer.it_process = current; - if (!error) { - static struct itimerspec zero_it; - - memset(it, 0, sizeof *it); - it->it_value = *rqtp; - - spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_set(&timer, flags, it, NULL); - if (error) { - spin_unlock_irq(&timer.it_lock); - return error; - } - - while (!signal_pending(current)) { - if (timer.it.cpu.expires == 0) { - /* - * Our timer fired and was reset, below - * deletion can not fail. - */ - posix_cpu_timer_del(&timer); - spin_unlock_irq(&timer.it_lock); - return 0; - } - - /* - * Block until cpu_timer_fire (or a signal) wakes us. - */ - __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&timer.it_lock); - schedule(); - spin_lock_irq(&timer.it_lock); - } - - /* - * We were interrupted by a signal. - */ - sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); - error = posix_cpu_timer_set(&timer, 0, &zero_it, it); - if (!error) { - /* - * Timer is now unarmed, deletion can not fail. - */ - posix_cpu_timer_del(&timer); - } - spin_unlock_irq(&timer.it_lock); - - while (error == TIMER_RETRY) { - /* - * We need to handle case when timer was or is in the - * middle of firing. In other cases we already freed - * resources. 
- */ - spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_del(&timer); - spin_unlock_irq(&timer.it_lock); - } - - if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { - /* - * It actually did fire already. - */ - return 0; - } - - error = -ERESTART_RESTARTBLOCK; - } - - return error; -} - -static long posix_cpu_nsleep_restart(struct restart_block *restart_block); - -static int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) -{ - struct restart_block *restart_block = - &current_thread_info()->restart_block; - struct itimerspec it; - int error; - - /* - * Diagnose required errors first. - */ - if (CPUCLOCK_PERTHREAD(which_clock) && - (CPUCLOCK_PID(which_clock) == 0 || - CPUCLOCK_PID(which_clock) == current->pid)) - return -EINVAL; - - error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); - - if (error == -ERESTART_RESTARTBLOCK) { - - if (flags & TIMER_ABSTIME) - return -ERESTARTNOHAND; - /* - * Report back to the user the time still remaining. - */ - if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) - return -EFAULT; - - restart_block->fn = posix_cpu_nsleep_restart; - restart_block->nanosleep.clockid = which_clock; - restart_block->nanosleep.rmtp = rmtp; - restart_block->nanosleep.expires = timespec_to_ns(rqtp); - } - return error; -} - -static long posix_cpu_nsleep_restart(struct restart_block *restart_block) -{ - clockid_t which_clock = restart_block->nanosleep.clockid; - struct timespec t; - struct itimerspec it; - int error; - - t = ns_to_timespec(restart_block->nanosleep.expires); - - error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); - - if (error == -ERESTART_RESTARTBLOCK) { - struct timespec __user *rmtp = restart_block->nanosleep.rmtp; - /* - * Report back to the user the time still remaining. 
- */ - if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) - return -EFAULT; - - restart_block->nanosleep.expires = timespec_to_ns(&t); - } - return error; - -} - -#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) -#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) - -static int process_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) -{ - return posix_cpu_clock_getres(PROCESS_CLOCK, tp); -} -static int process_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) -{ - return posix_cpu_clock_get(PROCESS_CLOCK, tp); -} -static int process_cpu_timer_create(struct k_itimer *timer) -{ - timer->it_clock = PROCESS_CLOCK; - return posix_cpu_timer_create(timer); -} -static int process_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, - struct timespec __user *rmtp) -{ - return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); -} -static long process_cpu_nsleep_restart(struct restart_block *restart_block) -{ - return -EINVAL; -} -static int thread_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) -{ - return posix_cpu_clock_getres(THREAD_CLOCK, tp); -} -static int thread_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) -{ - return posix_cpu_clock_get(THREAD_CLOCK, tp); -} -static int thread_cpu_timer_create(struct k_itimer *timer) -{ - timer->it_clock = THREAD_CLOCK; - return posix_cpu_timer_create(timer); -} - -struct k_clock clock_posix_cpu = { - .clock_getres = posix_cpu_clock_getres, - .clock_set = posix_cpu_clock_set, - .clock_get = posix_cpu_clock_get, - .timer_create = posix_cpu_timer_create, - .nsleep = posix_cpu_nsleep, - .nsleep_restart = posix_cpu_nsleep_restart, - .timer_set = posix_cpu_timer_set, - .timer_del = posix_cpu_timer_del, - .timer_get = posix_cpu_timer_get, -}; - -static __init int init_posix_cpu_timers(void) -{ - struct k_clock process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - 
.timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, - }; - struct k_clock thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .timer_create = thread_cpu_timer_create, - }; - struct timespec ts; - - posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); - posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); - - cputime_to_timespec(cputime_one_jiffy, &ts); - onecputick = ts.tv_nsec; - WARN_ON(ts.tv_sec != 0); - - return 0; -} -__initcall(init_posix_cpu_timers); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c deleted file mode 100644 index 424c2d4..0000000 --- a/kernel/posix-timers.c +++ /dev/null @@ -1,1121 +0,0 @@ -/* - * linux/kernel/posix-timers.c - * - * - * 2002-10-15 Posix Clocks & timers - * by George Anzinger george@mvista.com - * - * Copyright (C) 2002 2003 by MontaVista Software. - * - * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. - * Copyright (C) 2004 Boris Hu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA - */ - -/* These are all the functions necessary to implement - * POSIX clocks & timers - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Management arrays for POSIX timers. Timers are now kept in static hash table - * with 512 entries. - * Timer ids are allocated by local routine, which selects proper hash head by - * key, constructed from current->signal address and per signal struct counter. - * This keeps timer ids unique per process, but now they can intersect between - * processes. - */ - -/* - * Lets keep our timers in a slab cache :-) - */ -static struct kmem_cache *posix_timers_cache; - -static DEFINE_HASHTABLE(posix_timers_hashtable, 9); -static DEFINE_SPINLOCK(hash_lock); - -/* - * we assume that the new SIGEV_THREAD_ID shares no bits with the other - * SIGEV values. Here we put out an error if this assumption fails. - */ -#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ - ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) -#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" -#endif - -/* - * parisc wants ENOTSUP instead of EOPNOTSUPP - */ -#ifndef ENOTSUP -# define ENANOSLEEP_NOTSUP EOPNOTSUPP -#else -# define ENANOSLEEP_NOTSUP ENOTSUP -#endif - -/* - * The timer ID is turned into a timer address by idr_find(). - * Verifying a valid ID consists of: - * - * a) checking that idr_find() returns other than -1. - * b) checking that the timer id matches the one in the timer itself. - * c) that the timer owner is in the callers thread group. - */ - -/* - * CLOCKs: The POSIX standard calls for a couple of clocks and allows us - * to implement others. This structure defines the various - * clocks. 
- * - * RESOLUTION: Clock resolution is used to round up timer and interval - * times, NOT to report clock times, which are reported with as - * much resolution as the system can muster. In some cases this - * resolution may depend on the underlying clock hardware and - * may not be quantifiable until run time, and only then is the - * necessary code is written. The standard says we should say - * something about this issue in the documentation... - * - * FUNCTIONS: The CLOCKs structure defines possible functions to - * handle various clock functions. - * - * The standard POSIX timer management code assumes the - * following: 1.) The k_itimer struct (sched.h) is used for - * the timer. 2.) The list, it_lock, it_clock, it_id and - * it_pid fields are not modified by timer code. - * - * Permissions: It is assumed that the clock_settime() function defined - * for each clock will take care of permission checks. Some - * clocks may be set able by any user (i.e. local process - * clocks) others not. Currently the only set able clock we - * have is CLOCK_REALTIME and its high res counter part, both of - * which we beg off on and pass to do_sys_settimeofday(). - */ - -static struct k_clock posix_clocks[MAX_CLOCKS]; - -/* - * These ones are defined below. 
- */ -static int common_nsleep(const clockid_t, int flags, struct timespec *t, - struct timespec __user *rmtp); -static int common_timer_create(struct k_itimer *new_timer); -static void common_timer_get(struct k_itimer *, struct itimerspec *); -static int common_timer_set(struct k_itimer *, int, - struct itimerspec *, struct itimerspec *); -static int common_timer_del(struct k_itimer *timer); - -static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); - -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); - -#define lock_timer(tid, flags) \ -({ struct k_itimer *__timr; \ - __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ - __timr; \ -}) - -static int hash(struct signal_struct *sig, unsigned int nr) -{ - return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); -} - -static struct k_itimer *__posix_timers_find(struct hlist_head *head, - struct signal_struct *sig, - timer_t id) -{ - struct k_itimer *timer; - - hlist_for_each_entry_rcu(timer, head, t_hash) { - if ((timer->it_signal == sig) && (timer->it_id == id)) - return timer; - } - return NULL; -} - -static struct k_itimer *posix_timer_by_id(timer_t id) -{ - struct signal_struct *sig = current->signal; - struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; - - return __posix_timers_find(head, sig, id); -} - -static int posix_timer_add(struct k_itimer *timer) -{ - struct signal_struct *sig = current->signal; - int first_free_id = sig->posix_timer_id; - struct hlist_head *head; - int ret = -ENOENT; - - do { - spin_lock(&hash_lock); - head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; - if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { - hlist_add_head_rcu(&timer->t_hash, head); - ret = sig->posix_timer_id; - } - if (++sig->posix_timer_id < 0) - sig->posix_timer_id = 0; - if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) - /* Loop over all possible ids completed */ - ret = -EAGAIN; - 
spin_unlock(&hash_lock); - } while (ret == -ENOENT); - return ret; -} - -static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) -{ - spin_unlock_irqrestore(&timr->it_lock, flags); -} - -/* Get clock_realtime */ -static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) -{ - ktime_get_real_ts(tp); - return 0; -} - -/* Set clock_realtime */ -static int posix_clock_realtime_set(const clockid_t which_clock, - const struct timespec *tp) -{ - return do_sys_settimeofday(tp, NULL); -} - -static int posix_clock_realtime_adj(const clockid_t which_clock, - struct timex *t) -{ - return do_adjtimex(t); -} - -/* - * Get monotonic time for posix timers - */ -static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) -{ - ktime_get_ts(tp); - return 0; -} - -/* - * Get monotonic-raw time for posix timers - */ -static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) -{ - getrawmonotonic(tp); - return 0; -} - - -static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) -{ - *tp = current_kernel_time(); - return 0; -} - -static int posix_get_monotonic_coarse(clockid_t which_clock, - struct timespec *tp) -{ - *tp = get_monotonic_coarse(); - return 0; -} - -static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) -{ - *tp = ktime_to_timespec(KTIME_LOW_RES); - return 0; -} - -static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) -{ - get_monotonic_boottime(tp); - return 0; -} - -static int posix_get_tai(clockid_t which_clock, struct timespec *tp) -{ - timekeeping_clocktai(tp); - return 0; -} - -/* - * Initialize everything, well, just everything in Posix clocks/timers ;) - */ -static __init int init_posix_timers(void) -{ - struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_clock_realtime_get, - .clock_set = posix_clock_realtime_set, - .clock_adj = posix_clock_realtime_adj, - .nsleep = 
common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_monotonic = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_ktime_get_ts, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_monotonic_raw = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_get_monotonic_raw, - }; - struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, - }; - struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, - }; - struct k_clock clock_tai = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_get_tai, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_boottime = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - - posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); - posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); - posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); - posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); - 
posix_timers_register_clock(CLOCK_TAI, &clock_tai); - - posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); - return 0; -} - -__initcall(init_posix_timers); - -static void schedule_next_timer(struct k_itimer *timr) -{ - struct hrtimer *timer = &timr->it.real.timer; - - if (timr->it.real.interval.tv64 == 0) - return; - - timr->it_overrun += (unsigned int) hrtimer_forward(timer, - timer->base->get_time(), - timr->it.real.interval); - - timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1; - ++timr->it_requeue_pending; - hrtimer_restart(timer); -} - -/* - * This function is exported for use by the signal deliver code. It is - * called just prior to the info block being released and passes that - * block to us. It's function is to update the overrun entry AND to - * restart the timer. It should only be called if the timer is to be - * restarted (i.e. we have flagged this in the sys_private entry of the - * info block). - * - * To protect against the timer going away while the interrupt is queued, - * we require that the it_requeue_pending flag be set. - */ -void do_schedule_next_timer(struct siginfo *info) -{ - struct k_itimer *timr; - unsigned long flags; - - timr = lock_timer(info->si_tid, &flags); - - if (timr && timr->it_requeue_pending == info->si_sys_private) { - if (timr->it_clock < 0) - posix_cpu_timer_schedule(timr); - else - schedule_next_timer(timr); - - info->si_overrun += timr->it_overrun_last; - } - - if (timr) - unlock_timer(timr, flags); -} - -int posix_timer_event(struct k_itimer *timr, int si_private) -{ - struct task_struct *task; - int shared, ret = -1; - /* - * FIXME: if ->sigq is queued we can race with - * dequeue_signal()->do_schedule_next_timer(). - * - * If dequeue_signal() sees the "right" value of - * si_sys_private it calls do_schedule_next_timer(). - * We re-queue ->sigq and drop ->it_lock(). 
- * do_schedule_next_timer() locks the timer - * and re-schedules it while ->sigq is pending. - * Not really bad, but not that we want. - */ - timr->sigq->info.si_sys_private = si_private; - - rcu_read_lock(); - task = pid_task(timr->it_pid, PIDTYPE_PID); - if (task) { - shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); - ret = send_sigqueue(timr->sigq, task, shared); - } - rcu_read_unlock(); - /* If we failed to send the signal the timer stops. */ - return ret > 0; -} -EXPORT_SYMBOL_GPL(posix_timer_event); - -/* - * This function gets called when a POSIX.1b interval timer expires. It - * is used as a callback from the kernel internal timer. The - * run_timer_list code ALWAYS calls with interrupts on. - - * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. - */ -static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) -{ - struct k_itimer *timr; - unsigned long flags; - int si_private = 0; - enum hrtimer_restart ret = HRTIMER_NORESTART; - - timr = container_of(timer, struct k_itimer, it.real.timer); - spin_lock_irqsave(&timr->it_lock, flags); - - if (timr->it.real.interval.tv64 != 0) - si_private = ++timr->it_requeue_pending; - - if (posix_timer_event(timr, si_private)) { - /* - * signal was not sent because of sig_ignor - * we will not get a call back to restart it AND - * it should be restarted. - */ - if (timr->it.real.interval.tv64 != 0) { - ktime_t now = hrtimer_cb_get_time(timer); - - /* - * FIXME: What we really want, is to stop this - * timer completely and restart it in case the - * SIG_IGN is removed. This is a non trivial - * change which involves sighand locking - * (sigh !), which we don't want to do late in - * the release cycle. - * - * For now we just let timers with an interval - * less than a jiffie expire every jiffie to - * avoid softirq starvation in case of SIG_IGN - * and a very small interval, which would put - * the timer right back on the softirq pending - * list. 
By moving now ahead of time we trick - * hrtimer_forward() to expire the timer - * later, while we still maintain the overrun - * accuracy, but have some inconsistency in - * the timer_gettime() case. This is at least - * better than a starved softirq. A more - * complex fix which solves also another related - * inconsistency is already in the pipeline. - */ -#ifdef CONFIG_HIGH_RES_TIMERS - { - ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ); - - if (timr->it.real.interval.tv64 < kj.tv64) - now = ktime_add(now, kj); - } -#endif - timr->it_overrun += (unsigned int) - hrtimer_forward(timer, now, - timr->it.real.interval); - ret = HRTIMER_RESTART; - ++timr->it_requeue_pending; - } - } - - unlock_timer(timr, flags); - return ret; -} - -static struct pid *good_sigevent(sigevent_t * event) -{ - struct task_struct *rtn = current->group_leader; - - if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || - !same_thread_group(rtn, current) || - (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) - return NULL; - - if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) - return NULL; - - return task_pid(rtn); -} - -void posix_timers_register_clock(const clockid_t clock_id, - struct k_clock *new_clock) -{ - if ((unsigned) clock_id >= MAX_CLOCKS) { - printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", - clock_id); - return; - } - - if (!new_clock->clock_get) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", - clock_id); - return; - } - if (!new_clock->clock_getres) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", - clock_id); - return; - } - - posix_clocks[clock_id] = *new_clock; -} -EXPORT_SYMBOL_GPL(posix_timers_register_clock); - -static struct k_itimer * alloc_posix_timer(void) -{ - struct k_itimer *tmr; - tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); - if (!tmr) - return tmr; - 
if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { - kmem_cache_free(posix_timers_cache, tmr); - return NULL; - } - memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); - return tmr; -} - -static void k_itimer_rcu_free(struct rcu_head *head) -{ - struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); - - kmem_cache_free(posix_timers_cache, tmr); -} - -#define IT_ID_SET 1 -#define IT_ID_NOT_SET 0 -static void release_posix_timer(struct k_itimer *tmr, int it_id_set) -{ - if (it_id_set) { - unsigned long flags; - spin_lock_irqsave(&hash_lock, flags); - hlist_del_rcu(&tmr->t_hash); - spin_unlock_irqrestore(&hash_lock, flags); - } - put_pid(tmr->it_pid); - sigqueue_free(tmr->sigq); - call_rcu(&tmr->it.rcu, k_itimer_rcu_free); -} - -static struct k_clock *clockid_to_kclock(const clockid_t id) -{ - if (id < 0) - return (id & CLOCKFD_MASK) == CLOCKFD ? - &clock_posix_dynamic : &clock_posix_cpu; - - if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) - return NULL; - return &posix_clocks[id]; -} - -static int common_timer_create(struct k_itimer *new_timer) -{ - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); - return 0; -} - -/* Create a POSIX.1b interval timer. 
*/ - -SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, - struct sigevent __user *, timer_event_spec, - timer_t __user *, created_timer_id) -{ - struct k_clock *kc = clockid_to_kclock(which_clock); - struct k_itimer *new_timer; - int error, new_timer_id; - sigevent_t event; - int it_id_set = IT_ID_NOT_SET; - - if (!kc) - return -EINVAL; - if (!kc->timer_create) - return -EOPNOTSUPP; - - new_timer = alloc_posix_timer(); - if (unlikely(!new_timer)) - return -EAGAIN; - - spin_lock_init(&new_timer->it_lock); - new_timer_id = posix_timer_add(new_timer); - if (new_timer_id < 0) { - error = new_timer_id; - goto out; - } - - it_id_set = IT_ID_SET; - new_timer->it_id = (timer_t) new_timer_id; - new_timer->it_clock = which_clock; - new_timer->it_overrun = -1; - - if (timer_event_spec) { - if (copy_from_user(&event, timer_event_spec, sizeof (event))) { - error = -EFAULT; - goto out; - } - rcu_read_lock(); - new_timer->it_pid = get_pid(good_sigevent(&event)); - rcu_read_unlock(); - if (!new_timer->it_pid) { - error = -EINVAL; - goto out; - } - } else { - event.sigev_notify = SIGEV_SIGNAL; - event.sigev_signo = SIGALRM; - event.sigev_value.sival_int = new_timer->it_id; - new_timer->it_pid = get_pid(task_tgid(current)); - } - - new_timer->it_sigev_notify = event.sigev_notify; - new_timer->sigq->info.si_signo = event.sigev_signo; - new_timer->sigq->info.si_value = event.sigev_value; - new_timer->sigq->info.si_tid = new_timer->it_id; - new_timer->sigq->info.si_code = SI_TIMER; - - if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { - error = -EFAULT; - goto out; - } - - error = kc->timer_create(new_timer); - if (error) - goto out; - - spin_lock_irq(&current->sighand->siglock); - new_timer->it_signal = current->signal; - list_add(&new_timer->list, &current->signal->posix_timers); - spin_unlock_irq(&current->sighand->siglock); - - return 0; - /* - * In the case of the timer belonging to another task, after - * the task is unlocked, the timer is owned by the other 
task - * and may cease to exist at any time. Don't use or modify - * new_timer after the unlock call. - */ -out: - release_posix_timer(new_timer, it_id_set); - return error; -} - -/* - * Locking issues: We need to protect the result of the id look up until - * we get the timer locked down so it is not deleted under us. The - * removal is done under the idr spinlock so we use that here to bridge - * the find to the timer lock. To avoid a dead lock, the timer id MUST - * be release with out holding the timer lock. - */ -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) -{ - struct k_itimer *timr; - - /* - * timer_t could be any type >= int and we want to make sure any - * @timer_id outside positive int range fails lookup. - */ - if ((unsigned long long)timer_id > INT_MAX) - return NULL; - - rcu_read_lock(); - timr = posix_timer_by_id(timer_id); - if (timr) { - spin_lock_irqsave(&timr->it_lock, *flags); - if (timr->it_signal == current->signal) { - rcu_read_unlock(); - return timr; - } - spin_unlock_irqrestore(&timr->it_lock, *flags); - } - rcu_read_unlock(); - - return NULL; -} - -/* - * Get the time remaining on a POSIX.1b interval timer. This function - * is ALWAYS called with spin_lock_irq on the timer, thus it must not - * mess with irq. - * - * We have a couple of messes to clean up here. First there is the case - * of a timer that has a requeue pending. These timers should appear to - * be in the timer list with an expiry as if we were to requeue them - * now. - * - * The second issue is the SIGEV_NONE timer which may be active but is - * not really ever put in the timer list (to save system resources). - * This timer may be expired, and if so, we will do it here. Otherwise - * it is the same as a requeue pending timer WRT to what we should - * report. 
- */ -static void -common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) -{ - ktime_t now, remaining, iv; - struct hrtimer *timer = &timr->it.real.timer; - - memset(cur_setting, 0, sizeof(struct itimerspec)); - - iv = timr->it.real.interval; - - /* interval timer ? */ - if (iv.tv64) - cur_setting->it_interval = ktime_to_timespec(iv); - else if (!hrtimer_active(timer) && - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) - return; - - now = timer->base->get_time(); - - /* - * When a requeue is pending or this is a SIGEV_NONE - * timer move the expiry time forward by intervals, so - * expiry is > now. - */ - if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) - timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); - - remaining = ktime_sub(hrtimer_get_expires(timer), now); - /* Return 0 only, when the timer is expired and not pending */ - if (remaining.tv64 <= 0) { - /* - * A single shot SIGEV_NONE timer must return 0, when - * it is expired ! - */ - if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) - cur_setting->it_value.tv_nsec = 1; - } else - cur_setting->it_value = ktime_to_timespec(remaining); -} - -/* Get the time remaining on a POSIX.1b interval timer. */ -SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct itimerspec __user *, setting) -{ - struct itimerspec cur_setting; - struct k_itimer *timr; - struct k_clock *kc; - unsigned long flags; - int ret = 0; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - kc = clockid_to_kclock(timr->it_clock); - if (WARN_ON_ONCE(!kc || !kc->timer_get)) - ret = -EINVAL; - else - kc->timer_get(timr, &cur_setting); - - unlock_timer(timr, flags); - - if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) - return -EFAULT; - - return ret; -} - -/* - * Get the number of overruns of a POSIX.1b interval timer. 
This is to - * be the overrun of the timer last delivered. At the same time we are - * accumulating overruns on the next timer. The overrun is frozen when - * the signal is delivered, either at the notify time (if the info block - * is not queued) or at the actual delivery time (as we are informed by - * the call back to do_schedule_next_timer(). So all we need to do is - * to pick up the frozen overrun. - */ -SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) -{ - struct k_itimer *timr; - int overrun; - unsigned long flags; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - overrun = timr->it_overrun_last; - unlock_timer(timr, flags); - - return overrun; -} - -/* Set a POSIX.1b interval timer. */ -/* timr->it_lock is taken. */ -static int -common_timer_set(struct k_itimer *timr, int flags, - struct itimerspec *new_setting, struct itimerspec *old_setting) -{ - struct hrtimer *timer = &timr->it.real.timer; - enum hrtimer_mode mode; - - if (old_setting) - common_timer_get(timr, old_setting); - - /* disable the timer */ - timr->it.real.interval.tv64 = 0; - /* - * careful here. If smp we could be in the "fire" routine which will - * be spinning as we hold the lock. But this is ONLY an SMP issue. - */ - if (hrtimer_try_to_cancel(timer) < 0) - return TIMER_RETRY; - - timr->it_requeue_pending = (timr->it_requeue_pending + 2) & - ~REQUEUE_PENDING; - timr->it_overrun_last = 0; - - /* switch off the timer when it_value is zero */ - if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) - return 0; - - mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; - - hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); - - /* Convert interval */ - timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); - - /* SIGEV_NONE timers are not queued ! 
See common_timer_get */ - if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { - /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_MODE_REL) { - hrtimer_add_expires(timer, timer->base->get_time()); - } - return 0; - } - - hrtimer_start_expires(timer, mode); - return 0; -} - -/* Set a POSIX.1b interval timer */ -SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - const struct itimerspec __user *, new_setting, - struct itimerspec __user *, old_setting) -{ - struct k_itimer *timr; - struct itimerspec new_spec, old_spec; - int error = 0; - unsigned long flag; - struct itimerspec *rtn = old_setting ? &old_spec : NULL; - struct k_clock *kc; - - if (!new_setting) - return -EINVAL; - - if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) - return -EFAULT; - - if (!timespec_valid(&new_spec.it_interval) || - !timespec_valid(&new_spec.it_value)) - return -EINVAL; -retry: - timr = lock_timer(timer_id, &flag); - if (!timr) - return -EINVAL; - - kc = clockid_to_kclock(timr->it_clock); - if (WARN_ON_ONCE(!kc || !kc->timer_set)) - error = -EINVAL; - else - error = kc->timer_set(timr, flags, &new_spec, rtn); - - unlock_timer(timr, flag); - if (error == TIMER_RETRY) { - rtn = NULL; // We already got the old time... - goto retry; - } - - if (old_setting && !error && - copy_to_user(old_setting, &old_spec, sizeof (old_spec))) - error = -EFAULT; - - return error; -} - -static int common_timer_del(struct k_itimer *timer) -{ - timer->it.real.interval.tv64 = 0; - - if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) - return TIMER_RETRY; - return 0; -} - -static inline int timer_delete_hook(struct k_itimer *timer) -{ - struct k_clock *kc = clockid_to_kclock(timer->it_clock); - - if (WARN_ON_ONCE(!kc || !kc->timer_del)) - return -EINVAL; - return kc->timer_del(timer); -} - -/* Delete a POSIX.1b interval timer. 
*/ -SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) -{ - struct k_itimer *timer; - unsigned long flags; - -retry_delete: - timer = lock_timer(timer_id, &flags); - if (!timer) - return -EINVAL; - - if (timer_delete_hook(timer) == TIMER_RETRY) { - unlock_timer(timer, flags); - goto retry_delete; - } - - spin_lock(¤t->sighand->siglock); - list_del(&timer->list); - spin_unlock(¤t->sighand->siglock); - /* - * This keeps any tasks waiting on the spin lock from thinking - * they got something (see the lock code above). - */ - timer->it_signal = NULL; - - unlock_timer(timer, flags); - release_posix_timer(timer, IT_ID_SET); - return 0; -} - -/* - * return timer owned by the process, used by exit_itimers - */ -static void itimer_delete(struct k_itimer *timer) -{ - unsigned long flags; - -retry_delete: - spin_lock_irqsave(&timer->it_lock, flags); - - if (timer_delete_hook(timer) == TIMER_RETRY) { - unlock_timer(timer, flags); - goto retry_delete; - } - list_del(&timer->list); - /* - * This keeps any tasks waiting on the spin lock from thinking - * they got something (see the lock code above). - */ - timer->it_signal = NULL; - - unlock_timer(timer, flags); - release_posix_timer(timer, IT_ID_SET); -} - -/* - * This is called by do_exit or de_thread, only when there are no more - * references to the shared signal_struct. 
- */ -void exit_itimers(struct signal_struct *sig) -{ - struct k_itimer *tmr; - - while (!list_empty(&sig->posix_timers)) { - tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); - itimer_delete(tmr); - } -} - -SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - const struct timespec __user *, tp) -{ - struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec new_tp; - - if (!kc || !kc->clock_set) - return -EINVAL; - - if (copy_from_user(&new_tp, tp, sizeof (*tp))) - return -EFAULT; - - return kc->clock_set(which_clock, &new_tp); -} - -SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *,tp) -{ - struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec kernel_tp; - int error; - - if (!kc) - return -EINVAL; - - error = kc->clock_get(which_clock, &kernel_tp); - - if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) - error = -EFAULT; - - return error; -} - -SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, - struct timex __user *, utx) -{ - struct k_clock *kc = clockid_to_kclock(which_clock); - struct timex ktx; - int err; - - if (!kc) - return -EINVAL; - if (!kc->clock_adj) - return -EOPNOTSUPP; - - if (copy_from_user(&ktx, utx, sizeof(ktx))) - return -EFAULT; - - err = kc->clock_adj(which_clock, &ktx); - - if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) - return -EFAULT; - - return err; -} - -SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, - struct timespec __user *, tp) -{ - struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec rtn_tp; - int error; - - if (!kc) - return -EINVAL; - - error = kc->clock_getres(which_clock, &rtn_tp); - - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) - error = -EFAULT; - - return error; -} - -/* - * nanosleep for monotonic and realtime clocks - */ -static int common_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsave, struct timespec __user *rmtp) -{ - return 
hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? - HRTIMER_MODE_ABS : HRTIMER_MODE_REL, - which_clock); -} - -SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, - const struct timespec __user *, rqtp, - struct timespec __user *, rmtp) -{ - struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec t; - - if (!kc) - return -EINVAL; - if (!kc->nsleep) - return -ENANOSLEEP_NOTSUP; - - if (copy_from_user(&t, rqtp, sizeof (struct timespec))) - return -EFAULT; - - if (!timespec_valid(&t)) - return -EINVAL; - - return kc->nsleep(which_clock, flags, &t, rmtp); -} - -/* - * This will restart clock_nanosleep. This is required only by - * compat_clock_nanosleep_restart for now. - */ -long clock_nanosleep_restart(struct restart_block *restart_block) -{ - clockid_t which_clock = restart_block->nanosleep.clockid; - struct k_clock *kc = clockid_to_kclock(which_clock); - - if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) - return -EINVAL; - - return kc->nsleep_restart(restart_block); -} diff --git a/kernel/time.c b/kernel/time.c deleted file mode 100644 index 7c7964c..0000000 --- a/kernel/time.c +++ /dev/null @@ -1,714 +0,0 @@ -/* - * linux/kernel/time.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file contains the interface functions for the various - * time related system calls: time, stime, gettimeofday, settimeofday, - * adjtime - */ -/* - * Modification history kernel/time.c - * - * 1993-09-02 Philip Gladstone - * Created file with time related functions from sched/core.c and adjtimex() - * 1993-10-08 Torsten Duwe - * adjtime interface update and CMOS clock write code - * 1995-08-13 Torsten Duwe - * kernel PLL updated to 1994-12-13 specs (rfc-1589) - * 1999-01-16 Ulrich Windl - * Introduced error checking for many cases in adjtimex(). 
- * Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) - * (Even though the technical memorandum forbids it) - * 2004-07-14 Christoph Lameter - * Added getnstimeofday to allow the posix timer functions to return - * with nanosecond accuracy - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "timeconst.h" - -/* - * The timezone where the local system is located. Used as a default by some - * programs who obtain this value by using gettimeofday. - */ -struct timezone sys_tz; - -EXPORT_SYMBOL(sys_tz); - -#ifdef __ARCH_WANT_SYS_TIME - -/* - * sys_time() can be implemented in user-level using - * sys_gettimeofday(). Is this for backwards compatibility? If so, - * why not move it into the appropriate arch directory (for those - * architectures that need it). - */ -SYSCALL_DEFINE1(time, time_t __user *, tloc) -{ - time_t i = get_seconds(); - - if (tloc) { - if (put_user(i,tloc)) - return -EFAULT; - } - force_successful_syscall_return(); - return i; -} - -/* - * sys_stime() can be implemented in user-level using - * sys_settimeofday(). Is this for backwards compatibility? If so, - * why not move it into the appropriate arch directory (for those - * architectures that need it). 
- */ - -SYSCALL_DEFINE1(stime, time_t __user *, tptr) -{ - struct timespec tv; - int err; - - if (get_user(tv.tv_sec, tptr)) - return -EFAULT; - - tv.tv_nsec = 0; - - err = security_settime(&tv, NULL); - if (err) - return err; - - do_settimeofday(&tv); - return 0; -} - -#endif /* __ARCH_WANT_SYS_TIME */ - -SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, - struct timezone __user *, tz) -{ - if (likely(tv != NULL)) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (copy_to_user(tv, &ktv, sizeof(ktv))) - return -EFAULT; - } - if (unlikely(tz != NULL)) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -/* - * Indicates if there is an offset between the system clock and the hardware - * clock/persistent clock/rtc. - */ -int persistent_clock_is_local; - -/* - * Adjust the time obtained from the CMOS to be UTC time instead of - * local time. - * - * This is ugly, but preferable to the alternatives. Otherwise we - * would either need to write a program to do it in /etc/rc (and risk - * confusion if the program gets run more than once; it would also be - * hard to make the program warp the clock precisely n hours) or - * compile in the timezone information into the kernel. Bad, bad.... - * - * - TYT, 1992-01-01 - * - * The best thing to do is to keep the CMOS clock in universal time (UTC) - * as real UNIX machines always do it. This avoids all headaches about - * daylight saving times and warping kernel clocks. - */ -static inline void warp_clock(void) -{ - if (sys_tz.tz_minuteswest != 0) { - struct timespec adjust; - - persistent_clock_is_local = 1; - adjust.tv_sec = sys_tz.tz_minuteswest * 60; - adjust.tv_nsec = 0; - timekeeping_inject_offset(&adjust); - } -} - -/* - * In case for some reason the CMOS clock has not already been running - * in UTC, but in some local time: The first time we set the timezone, - * we will warp the clock so that it is ticking UTC time instead of - * local time. 
Presumably, if someone is setting the timezone then we - * are running in an environment where the programs understand about - * timezones. This should be done at boot time in the /etc/rc script, - * as soon as possible, so that the clock can be set right. Otherwise, - * various programs will get confused when the clock gets warped. - */ - -int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) -{ - static int firsttime = 1; - int error = 0; - - if (tv && !timespec_valid(tv)) - return -EINVAL; - - error = security_settime(tv, tz); - if (error) - return error; - - if (tz) { - sys_tz = *tz; - update_vsyscall_tz(); - if (firsttime) { - firsttime = 0; - if (!tv) - warp_clock(); - } - } - if (tv) - return do_settimeofday(tv); - return 0; -} - -SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, - struct timezone __user *, tz) -{ - struct timeval user_tv; - struct timespec new_ts; - struct timezone new_tz; - - if (tv) { - if (copy_from_user(&user_tv, tv, sizeof(*tv))) - return -EFAULT; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; - } - if (tz) { - if (copy_from_user(&new_tz, tz, sizeof(*tz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); -} - -SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) -{ - struct timex txc; /* Local copy of parameter */ - int ret; - - /* Copy the user data space into the kernel copy - * structure. But bear in mind that the structures - * may change - */ - if(copy_from_user(&txc, txc_p, sizeof(struct timex))) - return -EFAULT; - ret = do_adjtimex(&txc); - return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; -} - -/** - * current_fs_time - Return FS time - * @sb: Superblock. - * - * Return the current time truncated to the time granularity supported by - * the fs. 
- */ -struct timespec current_fs_time(struct super_block *sb) -{ - struct timespec now = current_kernel_time(); - return timespec_trunc(now, sb->s_time_gran); -} -EXPORT_SYMBOL(current_fs_time); - -/* - * Convert jiffies to milliseconds and back. - * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: - */ -unsigned int jiffies_to_msecs(const unsigned long j) -{ -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); -#else -# if BITS_PER_LONG == 32 - return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; -# else - return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; -# endif -#endif -} -EXPORT_SYMBOL(jiffies_to_msecs); - -unsigned int jiffies_to_usecs(const unsigned long j) -{ -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); -#else -# if BITS_PER_LONG == 32 - return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; -# else - return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; -# endif -#endif -} -EXPORT_SYMBOL(jiffies_to_usecs); - -/** - * timespec_trunc - Truncate timespec to a granularity - * @t: Timespec - * @gran: Granularity in ns. - * - * Truncate a timespec to a granularity. gran must be smaller than a second. - * Always rounds down. - * - * This function should be only used for timestamps returned by - * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because - * it doesn't handle the better resolution of the latter. - */ -struct timespec timespec_trunc(struct timespec t, unsigned gran) -{ - /* - * Division is pretty slow so avoid it for common cases. - * Currently current_kernel_time() never returns better than - * jiffies resolution. Exploit that. 
- */ - if (gran <= jiffies_to_usecs(1) * 1000) { - /* nothing */ - } else if (gran == 1000000000) { - t.tv_nsec = 0; - } else { - t.tv_nsec -= t.tv_nsec % gran; - } - return t; -} -EXPORT_SYMBOL(timespec_trunc); - -/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. - * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 - * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. - * - * [For the Julian calendar (which was used in Russia before 1917, - * Britain & colonies before 1752, anywhere else before 1582, - * and is still in use by some communities) leave out the - * -year/100+year/400 terms, and add 10.] - * - * This algorithm was first published by Gauss (I think). - * - * WARNING: this function will overflow on 2106-02-07 06:28:16 on - * machines where long is 32-bit! (However, as time_t is signed, we - * will already get problems at other places on 2038-01-19 03:14:08) - */ -unsigned long -mktime(const unsigned int year0, const unsigned int mon0, - const unsigned int day, const unsigned int hour, - const unsigned int min, const unsigned int sec) -{ - unsigned int mon = mon0, year = year0; - - /* 1..12 -> 11,12,1..10 */ - if (0 >= (int) (mon -= 2)) { - mon += 12; /* Puts Feb last since it has leap day */ - year -= 1; - } - - return ((((unsigned long) - (year/4 - year/100 + year/400 + 367*mon/12 + day) + - year*365 - 719499 - )*24 + hour /* now have hours */ - )*60 + min /* now have minutes */ - )*60 + sec; /* finally seconds */ -} - -EXPORT_SYMBOL(mktime); - -/** - * set_normalized_timespec - set timespec sec and nsec parts and normalize - * - * @ts: pointer to timespec variable to be set - * @sec: seconds to set - * @nsec: nanoseconds to set - * - * Set seconds and nanoseconds field of a timespec variable and - * normalize to the timespec storage format - * - * Note: The tv_nsec part is always in the range of - * 0 <= tv_nsec < NSEC_PER_SEC - * For negative values only the tv_sec field is negative ! 
- */ -void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec) -{ - while (nsec >= NSEC_PER_SEC) { - /* - * The following asm() prevents the compiler from - * optimising this loop into a modulo operation. See - * also __iter_div_u64_rem() in include/linux/time.h - */ - asm("" : "+rm"(nsec)); - nsec -= NSEC_PER_SEC; - ++sec; - } - while (nsec < 0) { - asm("" : "+rm"(nsec)); - nsec += NSEC_PER_SEC; - --sec; - } - ts->tv_sec = sec; - ts->tv_nsec = nsec; -} -EXPORT_SYMBOL(set_normalized_timespec); - -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -struct timespec ns_to_timespec(const s64 nsec) -{ - struct timespec ts; - s32 rem; - - if (!nsec) - return (struct timespec) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; - - return ts; -} -EXPORT_SYMBOL(ns_to_timespec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -struct timeval ns_to_timeval(const s64 nsec) -{ - struct timespec ts = ns_to_timespec(nsec); - struct timeval tv; - - tv.tv_sec = ts.tv_sec; - tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; - - return tv; -} -EXPORT_SYMBOL(ns_to_timeval); - -/* - * When we convert to jiffies then we interpret incoming values - * the following way: - * - * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) - * - * - 'too large' values [that would result in larger than - * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. - * - * - all other values are converted to jiffies by either multiplying - * the input value by a factor or dividing it with a factor - * - * We must also be careful about 32-bit overflows. 
- */ -unsigned long msecs_to_jiffies(const unsigned int m) -{ - /* - * Negative value, means infinite timeout: - */ - if ((int)m < 0) - return MAX_JIFFY_OFFSET; - -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - /* - * HZ is equal to or smaller than 1000, and 1000 is a nice - * round multiple of HZ, divide with the factor between them, - * but round upwards: - */ - return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - /* - * HZ is larger than 1000, and HZ is a nice round multiple of - * 1000 - simply multiply with the factor between them. - * - * But first make sure the multiplication result cannot - * overflow: - */ - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return m * (HZ / MSEC_PER_SEC); -#else - /* - * Generic case - multiply, round and divide. But first - * check that if we are doing a net multiplication, that - * we wouldn't overflow: - */ - if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) - >> MSEC_TO_HZ_SHR32; -#endif -} -EXPORT_SYMBOL(msecs_to_jiffies); - -unsigned long usecs_to_jiffies(const unsigned int u) -{ - if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return u * (HZ / USEC_PER_SEC); -#else - return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) - >> USEC_TO_HZ_SHR32; -#endif -} -EXPORT_SYMBOL(usecs_to_jiffies); - -/* - * The TICK_NSEC - 1 rounds up the value to the next resolution. Note - * that a remainder subtract here would not do the right thing as the - * resolution values don't fall on second boundries. I.e. the line: - * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. - * - * Rather, we just shift the bits off the right. 
- * - * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec - * value to a scaled second value. - */ -unsigned long -timespec_to_jiffies(const struct timespec *value) -{ - unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec + TICK_NSEC - 1; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - nsec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)nsec * NSEC_CONVERSION) >> - (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; - -} -EXPORT_SYMBOL(timespec_to_jiffies); - -void -jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u32 rem; - value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, - NSEC_PER_SEC, &rem); - value->tv_nsec = rem; -} -EXPORT_SYMBOL(jiffies_to_timespec); - -/* Same for "timeval" - * - * Well, almost. The problem here is that the real system resolution is - * in nanoseconds and the value being converted is in micro seconds. - * Also for some machines (those that use HZ = 1024, in-particular), - * there is a LARGE error in the tick size in microseconds. - - * The solution we use is to do the rounding AFTER we convert the - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. - * Instruction wise, this should cost only an additional add with carry - * instruction above the way it was done above. - */ -unsigned long -timeval_to_jiffies(const struct timeval *value) -{ - unsigned long sec = value->tv_sec; - long usec = value->tv_usec; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - usec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; -} -EXPORT_SYMBOL(timeval_to_jiffies); - -void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. 
- */ - u32 rem; - - value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, - NSEC_PER_SEC, &rem); - value->tv_usec = rem / NSEC_PER_USEC; -} -EXPORT_SYMBOL(jiffies_to_timeval); - -/* - * Convert jiffies/jiffies_64 to clock_t and back. - */ -clock_t jiffies_to_clock_t(unsigned long x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 -# if HZ < USER_HZ - return x * (USER_HZ / HZ); -# else - return x / (HZ / USER_HZ); -# endif -#else - return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); -#endif -} -EXPORT_SYMBOL(jiffies_to_clock_t); - -unsigned long clock_t_to_jiffies(unsigned long x) -{ -#if (HZ % USER_HZ)==0 - if (x >= ~0UL / (HZ / USER_HZ)) - return ~0UL; - return x * (HZ / USER_HZ); -#else - /* Don't worry about loss of precision here .. */ - if (x >= ~0UL / HZ * USER_HZ) - return ~0UL; - - /* .. but do try to contain it here */ - return div_u64((u64)x * HZ, USER_HZ); -#endif -} -EXPORT_SYMBOL(clock_t_to_jiffies); - -u64 jiffies_64_to_clock_t(u64 x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 -# if HZ < USER_HZ - x = div_u64(x * USER_HZ, HZ); -# elif HZ > USER_HZ - x = div_u64(x, HZ / USER_HZ); -# else - /* Nothing to do */ -# endif -#else - /* - * There are better ways that don't overflow early, - * but even this doesn't overflow in hundreds of years - * in 64 bits, so.. - */ - x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); -#endif - return x; -} -EXPORT_SYMBOL(jiffies_64_to_clock_t); - -u64 nsec_to_clock_t(u64 x) -{ -#if (NSEC_PER_SEC % USER_HZ) == 0 - return div_u64(x, NSEC_PER_SEC / USER_HZ); -#elif (USER_HZ % 512) == 0 - return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); -#else - /* - * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, - * overflow after 64.99 years. - * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... 
- */ - return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); -#endif -} - -/** - * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 - * - * @n: nsecs in u64 - * - * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. - * And this doesn't return MAX_JIFFY_OFFSET since this function is designed - * for scheduler, not for use in device drivers to calculate timeout value. - * - * note: - * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) - * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years - */ -u64 nsecs_to_jiffies64(u64 n) -{ -#if (NSEC_PER_SEC % HZ) == 0 - /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ - return div_u64(n, NSEC_PER_SEC / HZ); -#elif (HZ % 512) == 0 - /* overflow after 292 years if HZ = 1024 */ - return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); -#else - /* - * Generic case - optimized for cases where HZ is a multiple of 3. - * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. - */ - return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); -#endif -} - -/** - * nsecs_to_jiffies - Convert nsecs in u64 to jiffies - * - * @n: nsecs in u64 - * - * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. - * And this doesn't return MAX_JIFFY_OFFSET since this function is designed - * for scheduler, not for use in device drivers to calculate timeout value. - * - * note: - * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) - * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years - */ -unsigned long nsecs_to_jiffies(u64 n) -{ - return (unsigned long)nsecs_to_jiffies64(n); -} - -/* - * Add two timespec values and do a safety check for overflow. 
- * It's assumed that both values are valid (>= 0) - */ -struct timespec timespec_add_safe(const struct timespec lhs, - const struct timespec rhs) -{ - struct timespec res; - - set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, - lhs.tv_nsec + rhs.tv_nsec); - - if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) - res.tv_sec = TIME_T_MAX; - - return res; -} diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 57a413f..e59ce8b 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,3 +1,4 @@ +obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o posix-clock.o alarmtimer.o @@ -12,3 +13,19 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o + +$(obj)/time.o: $(obj)/timeconst.h + +quiet_cmd_hzfile = HZFILE $@ + cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ + +targets += hz.bc +$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE + $(call if_changed,hzfile) + +quiet_cmd_bc = BC $@ + cmd_bc = bc -q $(filter-out FORCE,$^) > $@ + +targets += timeconst.h +$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE + $(call if_changed,bc) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c new file mode 100644 index 0000000..3ab2899 --- /dev/null +++ b/kernel/time/hrtimer.c @@ -0,0 +1,1915 @@ +/* + * linux/kernel/hrtimer.c + * + * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner + * + * High-resolution kernel timers + * + * In contrast to the low-resolution timeout API implemented in + * kernel/timer.c, hrtimers provide finer resolution and accuracy + * depending on system configuration and capabilities. 
+ * + * These timers are currently used for: + * - itimers + * - POSIX timers + * - nanosleep + * - precise in-kernel timing + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * Credits: + * based on kernel/timer.c + * + * Help, testing, suggestions, bugfixes, improvements were + * provided by: + * + * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel + * et. al. + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +/* + * The timer bases: + * + * There are more clockids then hrtimer bases. Thus, we index + * into the timer bases by the hrtimer_base_type enum. When trying + * to reach a base using a clockid, hrtimer_clockid_to_base() + * is used to convert from clockid to the proper hrtimer_base_type. + */ +DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = +{ + + .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), + .clock_base = + { + { + .index = HRTIMER_BASE_MONOTONIC, + .clockid = CLOCK_MONOTONIC, + .get_time = &ktime_get, + .resolution = KTIME_LOW_RES, + }, + { + .index = HRTIMER_BASE_REALTIME, + .clockid = CLOCK_REALTIME, + .get_time = &ktime_get_real, + .resolution = KTIME_LOW_RES, + }, + { + .index = HRTIMER_BASE_BOOTTIME, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + .resolution = KTIME_LOW_RES, + }, + { + .index = HRTIMER_BASE_TAI, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + .resolution = KTIME_LOW_RES, + }, + } +}; + +static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, + [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, + [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_TAI] = HRTIMER_BASE_TAI, +}; + +static inline int hrtimer_clockid_to_base(clockid_t clock_id) +{ + return hrtimer_clock_to_base_table[clock_id]; +} + + +/* + * Get 
the coarse grained time at the softirq based on xtime and + * wall_to_monotonic. + */ +static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) +{ + ktime_t xtim, mono, boot; + struct timespec xts, tom, slp; + s32 tai_offset; + + get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); + tai_offset = timekeeping_get_tai_offset(); + + xtim = timespec_to_ktime(xts); + mono = ktime_add(xtim, timespec_to_ktime(tom)); + boot = ktime_add(mono, timespec_to_ktime(slp)); + base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; + base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; + base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; + base->clock_base[HRTIMER_BASE_TAI].softirq_time = + ktime_add(xtim, ktime_set(tai_offset, 0)); +} + +/* + * Functions and macros which are different for UP/SMP systems are kept in a + * single place + */ +#ifdef CONFIG_SMP + +/* + * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on the lists/queues. + * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = NULL and drop the lock: the timer remains + * locked. 
+ */ +static +struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) +{ + struct hrtimer_clock_base *base; + /* + * Retry until timer->base is non-NULL and still the same after its + * cpu_base lock has been taken. + */ + for (;;) { + base = timer->base; + if (likely(base != NULL)) { + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); + if (likely(base == timer->base)) + return base; + /* The timer has migrated to another CPU: */ + raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); + } + cpu_relax(); + } +} + +/* + * With HIGHRES=y we do not migrate the timer when it is expiring + * before the next event on the target cpu because we cannot reprogram + * the target cpu hardware and we would cause it to fire late. + * + * Called with cpu_base->lock of target cpu held. + * + * Returns nonzero when @timer expires at or before the next programmed + * event of @new_base's cpu_base, 0 otherwise. Always returns 0 when high + * resolution mode is not active on that base or not configured. + */ +static int +hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires; + + if (!new_base->cpu_base->hres_active) + return 0; + + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); + return expires.tv64 <= new_base->cpu_base->expires_next.tv64; +#else + return 0; +#endif +} + +/* + * Switch the timer base to the current CPU when possible. + */ +static inline struct hrtimer_clock_base * +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, + int pinned) +{ + struct hrtimer_clock_base *new_base; + struct hrtimer_cpu_base *new_cpu_base; + int this_cpu = smp_processor_id(); + int cpu = get_nohz_timer_target(pinned); + int basenum = base->index; + +again: + new_cpu_base = &per_cpu(hrtimer_bases, cpu); + new_base = &new_cpu_base->clock_base[basenum]; + + if (base != new_base) { + /* + * We are trying to move timer to new_base. + * However we can't change timer's base while it is running, + * so we keep it on the same CPU. No hassle vs. reprogramming + * the event source in the high resolution case. The softirq + * code will take care of this when the timer function has + * completed.
There is no conflict as we hold the lock until + * the timer is enqueued. + */ + if (unlikely(hrtimer_callback_running(timer))) + return base; + + /* See the hashed locking comment above lock_hrtimer_base() */ + timer->base = NULL; + raw_spin_unlock(&base->cpu_base->lock); + raw_spin_lock(&new_base->cpu_base->lock); + + if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { + cpu = this_cpu; + raw_spin_unlock(&new_base->cpu_base->lock); + raw_spin_lock(&base->cpu_base->lock); + timer->base = base; + goto again; + } + timer->base = new_base; + } else { + if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { + cpu = this_cpu; + goto again; + } + } + return new_base; +} + +#else /* CONFIG_SMP */ + +static inline struct hrtimer_clock_base * +lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +{ + struct hrtimer_clock_base *base = timer->base; + + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); + + return base; +} + +# define switch_hrtimer_base(t, b, p) (b) + +#endif /* !CONFIG_SMP */ + +/* + * Functions for the union type storage format of ktime_t which are + * too large for inlining: + */ +#if BITS_PER_LONG < 64 +# ifndef CONFIG_KTIME_SCALAR +/** + * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable + * @kt: addend + * @nsec: the scalar nsec value to add + * + * Returns the sum of kt and nsec in ktime_t format, or KTIME_MAX when + * the seconds part of @nsec exceeds KTIME_SEC_MAX. + */ +ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) +{ + ktime_t tmp; + + if (likely(nsec < NSEC_PER_SEC)) { + tmp.tv64 = nsec; + } else { + unsigned long rem = do_div(nsec, NSEC_PER_SEC); + + /* Make sure nsec fits into long */ + if (unlikely(nsec > KTIME_SEC_MAX)) + return (ktime_t){ .tv64 = KTIME_MAX }; + + tmp = ktime_set((long)nsec, rem); + } + + return ktime_add(kt, tmp); +} + +EXPORT_SYMBOL_GPL(ktime_add_ns); + +/** + * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable + * @kt: minuend + * @nsec: the scalar nsec value to subtract + * + * Returns the subtraction of @nsec from @kt in
ktime_t format + */ +ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) +{ + ktime_t tmp; + + if (likely(nsec < NSEC_PER_SEC)) { + tmp.tv64 = nsec; + } else { + unsigned long rem = do_div(nsec, NSEC_PER_SEC); + + tmp = ktime_set((long)nsec, rem); + } + + return ktime_sub(kt, tmp); +} + +EXPORT_SYMBOL_GPL(ktime_sub_ns); +# endif /* !CONFIG_KTIME_SCALAR */ + +/* + * Divide a ktime value by a nanosecond value + * + * When @div does not fit into 32 bits, @div and the dividend are both + * shifted right by the same number of bits before the division, so the + * low bits are dropped from the calculation. + */ +u64 ktime_divns(const ktime_t kt, s64 div) +{ + u64 dclc; + int sft = 0; + + dclc = ktime_to_ns(kt); + /* Make sure the divisor is less than 2^32: */ + while (div >> 32) { + sft++; + div >>= 1; + } + dclc >>= sft; + do_div(dclc, (unsigned long) div); + + return dclc; +} +#endif /* BITS_PER_LONG < 64 */ + +/* + * Add two ktime values and do a safety check for overflow: + */ +ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) +{ + ktime_t res = ktime_add(lhs, rhs); + + /* + * We use KTIME_SEC_MAX here, the maximum timeout which we can + * return to user space in a timespec: + */ + if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64) + res = ktime_set(KTIME_SEC_MAX, 0); + + return res; +} + +EXPORT_SYMBOL_GPL(ktime_add_safe); + +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static struct debug_obj_descr hrtimer_debug_descr; +/* debugobjects hint: report the timer's callback function */ +static void *hrtimer_debug_hint(void *addr) +{ + return ((struct hrtimer *) addr)->function; +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + hrtimer_cancel(timer); + debug_object_init(timer, &hrtimer_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) +{ + switch (state) { + + case
ODEBUG_STATE_NOTAVAILABLE: + WARN_ON_ONCE(1); + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + /* fall through */ + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + hrtimer_cancel(timer); + debug_object_free(timer, &hrtimer_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr hrtimer_debug_descr = { + .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, + .fixup_init = hrtimer_fixup_init, + .fixup_activate = hrtimer_fixup_activate, + .fixup_free = hrtimer_fixup_free, +}; + +static inline void debug_hrtimer_init(struct hrtimer *timer) +{ + debug_object_init(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_activate(struct hrtimer *timer) +{ + debug_object_activate(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_deactivate(struct hrtimer *timer) +{ + debug_object_deactivate(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_free(struct hrtimer *timer) +{ + debug_object_free(timer, &hrtimer_debug_descr); +} + +static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode); + +void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_object_init_on_stack(timer, &hrtimer_debug_descr); + __hrtimer_init(timer, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); + +void destroy_hrtimer_on_stack(struct hrtimer *timer) +{ + debug_object_free(timer, &hrtimer_debug_descr); +} + +#else +static inline void debug_hrtimer_init(struct hrtimer *timer) { } +static inline void debug_hrtimer_activate(struct hrtimer *timer) { } +static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } +#endif + +static inline void +debug_init(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode
mode) +{ + debug_hrtimer_init(timer); + trace_hrtimer_init(timer, clockid, mode); +} + +static inline void debug_activate(struct hrtimer *timer) +{ + debug_hrtimer_activate(timer); + trace_hrtimer_start(timer); +} + +static inline void debug_deactivate(struct hrtimer *timer) +{ + debug_hrtimer_deactivate(timer); + trace_hrtimer_cancel(timer); +} + +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer enabled ? + */ +static int hrtimer_hres_enabled __read_mostly = 1; + +/* + * Enable / Disable high resolution mode + */ +static int __init setup_hrtimer_hres(char *str) +{ + if (!strcmp(str, "off")) + hrtimer_hres_enabled = 0; + else if (!strcmp(str, "on")) + hrtimer_hres_enabled = 1; + else + return 0; + return 1; +} + +__setup("highres=", setup_hrtimer_hres); + +/* + * hrtimer_is_hres_enabled - query, if the highres mode is enabled + */ +static inline int hrtimer_is_hres_enabled(void) +{ + return hrtimer_hres_enabled; +} + +/* + * Is the high resolution mode active ? + */ +static inline int hrtimer_hres_active(void) +{ + return __this_cpu_read(hrtimer_bases.hres_active); +} + +/* + * Reprogram the event source with checking all clock bases for the + * next event + * Called with interrupts disabled and base->lock held + * + * If @skip_equal is set, nothing is done when the newly computed expiry + * equals the already stored cpu_base->expires_next. + */ +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +{ + int i; + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires, expires_next; + + expires_next.tv64 = KTIME_MAX; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct hrtimer *timer; + struct timerqueue_node *next; + + next = timerqueue_getnext(&base->active); + if (!next) + continue; + timer = container_of(next, struct hrtimer, node); + + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + /* + * clock_was_set() has changed base->offset so the + * result might be negative.
Fix it up to prevent a + * false positive in clockevents_program_event() + */ + if (expires.tv64 < 0) + expires.tv64 = 0; + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + } + + if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) + return; + + cpu_base->expires_next.tv64 = expires_next.tv64; + + /* + * If a hang was detected in the last timer interrupt then we + * leave the hang delay active in the hardware. We want the + * system to make progress. That also prevents the following + * scenario: + * T1 expires 50ms from now + * T2 expires 5s from now + * + * T1 is removed, so this code is called and would reprogram + * the hardware to 5s from now. Any hrtimer_start after that + * will not reprogram the hardware due to hang_detected being + * set. So we'd effectively block all timers until the T2 event + * fires. + */ + if (cpu_base->hang_detected) + return; + + if (cpu_base->expires_next.tv64 != KTIME_MAX) + tick_program_event(cpu_base->expires_next, 1); +} + +/* + * Shared reprogramming for clock_realtime and clock_monotonic + * + * When a timer is enqueued and expires earlier than the already enqueued + * timers, we have to check, whether it expires earlier than the timer for + * which the clock event device was armed. + * + * Called with interrupts disabled and base->cpu_base.lock held + * + * Returns 0 when the clock event device was not touched, -ETIME when the + * expiry is negative after subtracting base->offset, otherwise the result + * of tick_program_event(). + */ +static int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + int res; + + WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); + + /* + * When the callback is running, we do not reprogram the clock event + * device. The timer callback is either running on a different CPU or + * the callback is executed in the hrtimer_interrupt context. The + * reprogramming is handled either by the softirq, which called the + * callback or at the end of the hrtimer_interrupt.
+ */ + if (hrtimer_callback_running(timer)) + return 0; + + /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. Nothing wrong + * about that, just avoid to call into the tick code, which + * now objects to negative expiry values. + */ + if (expires.tv64 < 0) + return -ETIME; + + if (expires.tv64 >= cpu_base->expires_next.tv64) + return 0; + + /* + * If a hang was detected in the last timer interrupt then we + * do not schedule a timer which is earlier than the expiry + * which we enforced in the hang detection. We want the system + * to make progress. + */ + if (cpu_base->hang_detected) + return 0; + + /* + * Clockevents returns -ETIME, when the event was in the past. + */ + res = tick_program_event(expires, 0); + if (!IS_ERR_VALUE(res)) + cpu_base->expires_next = expires; + return res; +} + +/* + * Initialize the high resolution related parts of cpu_base + */ +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) +{ + base->expires_next.tv64 = KTIME_MAX; + base->hres_active = 0; +} + +/* + * When High resolution timers are active, try to reprogram. Note, that in case + * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry + * check happens. The timer gets enqueued into the rbtree. The reprogramming + * and expiry check is done in the hrtimer_interrupt or in the softirq.
+ */ +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); +} +/* + * Pass the realtime, boottime and TAI offset fields of @base to + * ktime_get_update_offsets() and return its result, which + * hrtimer_interrupt() uses as the current time. + */ +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; + + return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); +} + +/* + * Retrigger next event is called after clock was set + * + * Called with interrupts disabled via on_each_cpu() + * + * Does nothing unless high resolution mode is active on this CPU. + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + + if (!hrtimer_hres_active()) + return; + + raw_spin_lock(&base->lock); + hrtimer_update_base(base); + hrtimer_force_reprogram(base, 0); + raw_spin_unlock(&base->lock); +} + +/* + * Switch to high resolution mode + * + * Returns 1 when high resolution mode is active on this CPU after the + * call (it already was, or the switch succeeded) and 0 when + * tick_init_highres() failed. + */ +static int hrtimer_switch_to_hres(void) +{ + int i, cpu = smp_processor_id(); + struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); + unsigned long flags; + + if (base->hres_active) + return 1; + + local_irq_save(flags); + + if (tick_init_highres()) { + local_irq_restore(flags); + printk(KERN_WARNING "Could not switch to high resolution " + "mode on CPU %d\n", cpu); + return 0; + } + base->hres_active = 1; + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + base->clock_base[i].resolution = KTIME_HIGH_RES; + + tick_setup_sched_timer(); + /* "Retrigger" the interrupt to get things going */ + retrigger_next_event(NULL); + local_irq_restore(flags); + return 1; +} +/* Work item handler: runs clock_was_set() from the workqueue */ +static void clock_was_set_work(struct work_struct *work) +{ + clock_was_set(); +} + +static DECLARE_WORK(hrtimer_work, clock_was_set_work); + +/* + * Called from timekeeping and resume code to reprogramm the hrtimer + * interrupt device on all cpus.
+ */ +void clock_was_set_delayed(void) +{ + schedule_work(&hrtimer_work); +} + +#else +/* Stubs for builds without CONFIG_HIGH_RES_TIMERS: high resolution mode is never enabled or active */ +static inline int hrtimer_hres_active(void) { return 0; } +static inline int hrtimer_is_hres_enabled(void) { return 0; } +static inline int hrtimer_switch_to_hres(void) { return 0; } +static inline void +hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + return 0; +} +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } +static inline void retrigger_next_event(void *arg) { } + +#endif /* CONFIG_HIGH_RES_TIMERS */ + +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code. + * + * In all configurations timerfd_clock_was_set() is called afterwards. + */ +void clock_was_set(void) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + /* Retrigger the CPU local events everywhere */ + on_each_cpu(retrigger_next_event, NULL, 1); +#endif + timerfd_clock_was_set(); +} + +/* + * During resume we might have to reprogram the high resolution timer + * interrupt on all online CPUs. However, all other CPUs will be + * stopped with IRQs interrupts disabled so the clock_was_set() call + * must be deferred.
+ */ +void hrtimers_resume(void) +{ + WARN_ONCE(!irqs_disabled(), + KERN_INFO "hrtimers_resume() called with IRQs enabled!"); + + /* Retrigger on the local CPU */ + retrigger_next_event(NULL); + /* And schedule a retrigger for all others */ + clock_was_set_delayed(); +} +/* + * With CONFIG_TIMER_STATS: record the caller's return address, the current + * task's comm and pid in @timer, unless a start site is already recorded. + */ +static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + if (timer->start_site) + return; + timer->start_site = __builtin_return_address(0); + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +#endif +} +/* With CONFIG_TIMER_STATS: forget the recorded start site of @timer */ +static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; +#endif +} +/* + * With CONFIG_TIMER_STATS: pass the recorded start info of @timer to + * timer_stats_update_stats() when timer_stats_active is set. + */ +static inline void timer_stats_account_hrtimer(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + if (likely(!timer_stats_active)) + return; + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, 0); +#endif +} + +/* + * Counterpart to lock_hrtimer_base above: + */ +static inline +void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +{ + raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); +} + +/** + * hrtimer_forward - forward the timer expiry + * @timer: hrtimer to forward + * @now: forward past this time + * @interval: the interval to forward + * + * Forward the timer expiry so it will expire in the future. + * Returns the number of overruns.
+ */ +u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) +{ + u64 orun = 1; + ktime_t delta; + /* A negative delta means the expiry is still after @now: nothing to forward */ + delta = ktime_sub(now, hrtimer_get_expires(timer)); + + if (delta.tv64 < 0) + return 0; + /* Never forward by less than the resolution of the timer's clock base */ + if (interval.tv64 < timer->base->resolution.tv64) + interval.tv64 = timer->base->resolution.tv64; + + if (unlikely(delta.tv64 >= interval.tv64)) { + s64 incr = ktime_to_ns(interval); + + orun = ktime_divns(delta, incr); + hrtimer_add_expires_ns(timer, incr * orun); + if (hrtimer_get_expires_tv64(timer) > now.tv64) + return orun; + /* + * This (and the ktime_add() below) is the + * correction for exact: + */ + orun++; + } + hrtimer_add_expires(timer, interval); + + return orun; +} +EXPORT_SYMBOL_GPL(hrtimer_forward); + +/* + * enqueue_hrtimer - internal function to (re)start a timer + * + * The timer is inserted in expiry order. Insertion into the + * red black tree is O(log(n)). Must hold the base lock. + * + * Returns 1 when the new timer is the leftmost timer in the tree. + */ +static int enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + debug_activate(timer); + + timerqueue_add(&base->active, &timer->node); + base->cpu_base->active_bases |= 1 << base->index; + + /* + * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the + * state of a possibly running callback. + */ + timer->state |= HRTIMER_STATE_ENQUEUED; + + return (&timer->node == base->active.next); +} + +/* + * __remove_hrtimer - internal function to remove a timer + * + * Caller must hold the base lock. + * + * High resolution timer mode reprograms the clock event device when the + * timer is the one which expires next. The caller can disable this by setting + * reprogram to zero. This is useful, when the context does a reprogramming + * anyway (e.g.
timer interrupt) + */ +static void __remove_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, + unsigned long newstate, int reprogram) +{ + struct timerqueue_node *next_timer; + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; + + next_timer = timerqueue_getnext(&base->active); + timerqueue_del(&base->active, &timer->node); + if (&timer->node == next_timer) { +#ifdef CONFIG_HIGH_RES_TIMERS + /* Reprogram the clock event device, if enabled */ + if (reprogram && hrtimer_hres_active()) { + ktime_t expires; + + expires = ktime_sub(hrtimer_get_expires(timer), + base->offset); + if (base->cpu_base->expires_next.tv64 == expires.tv64) + hrtimer_force_reprogram(base->cpu_base, 1); + } +#endif + } + if (!timerqueue_getnext(&base->active)) + base->cpu_base->active_bases &= ~(1 << base->index); +out: + timer->state = newstate; +} + +/* + * remove hrtimer, called with base lock held + * + * Returns 1 when the timer was queued and has been removed, 0 otherwise. + */ +static inline int +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) +{ + if (hrtimer_is_queued(timer)) { + unsigned long state; + int reprogram; + + /* + * Remove the timer and force reprogramming when high + * resolution mode is active and the timer is on the current + * CPU. If we remove a timer on another CPU, reprogramming is + * skipped. The interrupt event on this CPU is fired and + * reprogramming happens in the interrupt handler. This is a + * rare case and less expensive than a smp call. + */ + debug_deactivate(timer); + timer_stats_hrtimer_clear_start_info(timer); + reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); + /* + * We must preserve the CALLBACK state flag here, + * otherwise we could move the timer base in + * switch_hrtimer_base.
+ */ + state = timer->state & HRTIMER_STATE_CALLBACK; + __remove_hrtimer(timer, base, state, reprogram); + return 1; + } + return 0; +} +/* + * (Re)start @timer: remove it from its queue if queued, compute the expiry + * (HRTIMER_MODE_REL adds the current time of the clock base), switch the + * base if necessary and enqueue the timer. + * + * When hrtimer_enqueue_reprogram() returns nonzero for a new leftmost timer + * on this CPU, HRTIMER_SOFTIRQ is raised: with @wakeup set the base lock is + * dropped first and raise_softirq_irqoff() is used, otherwise + * __raise_softirq_irqoff() is called with the lock still held. + * + * Returns the result of remove_hrtimer(): 1 when the timer was queued, + * 0 otherwise. + */ +int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode, + int wakeup) +{ + struct hrtimer_clock_base *base, *new_base; + unsigned long flags; + int ret, leftmost; + + base = lock_hrtimer_base(timer, &flags); + + /* Remove an active timer from the queue: */ + ret = remove_hrtimer(timer, base); + + if (mode & HRTIMER_MODE_REL) { + tim = ktime_add_safe(tim, base->get_time()); + /* + * CONFIG_TIME_LOW_RES is a temporary way for architectures + * to signal that they simply return xtime in + * do_gettimeoffset(). In this case we want to round up by + * resolution when starting a relative timer, to avoid short + * timeouts. This will go away with the GTOD framework. + */ +#ifdef CONFIG_TIME_LOW_RES + tim = ktime_add_safe(tim, base->resolution); +#endif + } + + hrtimer_set_expires_range_ns(timer, tim, delta_ns); + + /* Switch the timer base, if necessary: */ + new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); + + timer_stats_hrtimer_set_start_info(timer); + + leftmost = enqueue_hrtimer(timer, new_base); + + /* + * Only allow reprogramming if the new base is on this CPU. + * (it might still be on another CPU if the timer was pending) + * + * XXX send_remote_softirq() ? + */ + if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) + && hrtimer_enqueue_reprogram(timer, new_base)) { + if (wakeup) { + /* + * We need to drop cpu_base->lock to avoid a + * lock ordering issue vs. rq->lock.
+ */ + raw_spin_unlock(&new_base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + local_irq_restore(flags); + return ret; + } else { + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + } + + unlock_hrtimer_base(timer, &flags); + + return ret; +} +EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); + +/** + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) +{ + return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); +} +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); + +/** + * hrtimer_start - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int +hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) +{ + return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); +} +EXPORT_SYMBOL_GPL(hrtimer_start); + + +/** + * hrtimer_try_to_cancel - try to deactivate a timer + * @timer: hrtimer to stop + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + * -1 when the timer is currently executing the callback function and + * cannot be stopped + */ +int hrtimer_try_to_cancel(struct hrtimer *timer) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + int ret = -1; + + base = lock_hrtimer_base(timer, &flags); + + if (!hrtimer_callback_running(timer)) + ret = remove_hrtimer(timer, base); + + unlock_hrtimer_base(timer, &flags); + + return ret; + +} +EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); + +/** + * hrtimer_cancel - cancel
a timer and wait for the handler to finish. + * @timer: the timer to be cancelled + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + */ +int hrtimer_cancel(struct hrtimer *timer) +{ + for (;;) { + int ret = hrtimer_try_to_cancel(timer); + + if (ret >= 0) + return ret; + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(hrtimer_cancel); + +/** + * hrtimer_get_remaining - get remaining time for the timer + * @timer: the timer to read + * + * Returns the result of hrtimer_expires_remaining(), read while the + * timer base is locked. + */ +ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +{ + unsigned long flags; + ktime_t rem; + + lock_hrtimer_base(timer, &flags); + rem = hrtimer_expires_remaining(timer); + unlock_hrtimer_base(timer, &flags); + + return rem; +} +EXPORT_SYMBOL_GPL(hrtimer_get_remaining); + +#ifdef CONFIG_NO_HZ_COMMON +/** + * hrtimer_get_next_event - get the time until next expiry event + * + * Returns the delta to the next expiry event or KTIME_MAX if no timer + * is pending. The queues are only scanned when high resolution mode is + * not active; otherwise KTIME_MAX is returned as well. A negative delta + * is reported as 0. + */ +ktime_t hrtimer_get_next_event(void) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; + unsigned long flags; + int i; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (!hrtimer_hres_active()) { + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct hrtimer *timer; + struct timerqueue_node *next; + + next = timerqueue_getnext(&base->active); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + delta.tv64 = hrtimer_get_expires_tv64(timer); + delta = ktime_sub(delta, base->get_time()); + if (delta.tv64 < mindelta.tv64) + mindelta.tv64 = delta.tv64; + } + } + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + if (mindelta.tv64 < 0) + mindelta.tv64 = 0; + return mindelta; +} +#endif + +static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + struct hrtimer_cpu_base *cpu_base; + int base; + + memset(timer, 0,
sizeof(struct hrtimer)); + + cpu_base = &__raw_get_cpu_var(hrtimer_bases); + /* CLOCK_REALTIME timers in any mode other than HRTIMER_MODE_ABS are put on CLOCK_MONOTONIC */ + if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) + clock_id = CLOCK_MONOTONIC; + + base = hrtimer_clockid_to_base(clock_id); + timer->base = &cpu_base->clock_base[base]; + timerqueue_init(&timer->node); + +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif +} + +/** + * hrtimer_init - initialize a timer to the given clock + * @timer: the timer to be initialized + * @clock_id: the clock to be used + * @mode: timer mode abs/rel + */ +void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_init(timer, clock_id, mode); + __hrtimer_init(timer, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init); + +/** + * hrtimer_get_res - get the timer resolution for a clock + * @which_clock: which clock to query + * @tp: pointer to timespec variable to store the resolution + * + * Store the resolution of the clock selected by @which_clock in the + * variable pointed to by @tp.
+ */ +int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) +{ + struct hrtimer_cpu_base *cpu_base; + int base = hrtimer_clockid_to_base(which_clock); + /* The resolution is read from the clock base selected via hrtimer_bases; this function always returns 0 */ + cpu_base = &__raw_get_cpu_var(hrtimer_bases); + *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); + + return 0; +} +EXPORT_SYMBOL_GPL(hrtimer_get_res); +/* + * Run the callback function of @timer. Expects interrupts to be disabled + * (warns otherwise). The timer is dequeued with state + * HRTIMER_STATE_CALLBACK, cpu_base->lock is dropped around the callback + * and taken again afterwards, and the timer is enqueued again unless the + * callback returned HRTIMER_NORESTART. + */ +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) +{ + struct hrtimer_clock_base *base = timer->base; + struct hrtimer_cpu_base *cpu_base = base->cpu_base; + enum hrtimer_restart (*fn)(struct hrtimer *); + int restart; + + WARN_ON(!irqs_disabled()); + + debug_deactivate(timer); + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); + fn = timer->function; + + /* + * Because we run timers from hardirq context, there is no chance + * they get migrated to another cpu, therefore it's safe to unlock + * the timer base. + */ + raw_spin_unlock(&cpu_base->lock); + trace_hrtimer_expire_entry(timer, now); + restart = fn(timer); + trace_hrtimer_expire_exit(timer); + raw_spin_lock(&cpu_base->lock); + + /* + * Note: We clear the CALLBACK bit after enqueue_hrtimer and + * we do not reprogram the event hardware.
Happens either in + * hrtimer_start_range_ns() or in hrtimer_interrupt() + */ + if (restart != HRTIMER_NORESTART) { + BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); + enqueue_hrtimer(timer, base); + } + + WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); + + timer->state &= ~HRTIMER_STATE_CALLBACK; +} + +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + ktime_t expires_next, now, entry_time, delta; + int i, retries = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event.tv64 = KTIME_MAX; + + raw_spin_lock(&cpu_base->lock); + entry_time = now = hrtimer_update_base(cpu_base); +retry: + expires_next.tv64 = KTIME_MAX; + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU. + */ + cpu_base->expires_next.tv64 = KTIME_MAX; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + struct hrtimer_clock_base *base; + struct timerqueue_node *node; + ktime_t basenow; + + if (!(cpu_base->active_bases & (1 << i))) + continue; + + base = cpu_base->clock_base + i; + basenow = ktime_add(now, base->offset); + + while ((node = timerqueue_getnext(&base->active))) { + struct hrtimer *timer; + + timer = container_of(node, struct hrtimer, node); + + /* + * The immediate goal for using the softexpires is + * minimizing wakeups, not running timers at the + * earliest interrupt after their soft expiration. + * This allows us to avoid using a Priority Search + * Tree, which can answer a stabbing query for + * overlapping intervals and instead use the simple + * BST we already have.
+ * We don't add extra wakeups by delaying timers that + * are right-of a not yet expired timer, because that + * timer will have to trigger a wakeup anyway. + */ + + if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { + ktime_t expires; + + expires = ktime_sub(hrtimer_get_expires(timer), + base->offset); + if (expires.tv64 < 0) + expires.tv64 = KTIME_MAX; + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + break; + } + + __run_hrtimer(timer, &basenow); + } + } + + /* + * Store the new expiry value so the migration code can verify + * against it. + */ + cpu_base->expires_next = expires_next; + raw_spin_unlock(&cpu_base->lock); + + /* Reprogramming necessary ? */ + if (expires_next.tv64 == KTIME_MAX || + !tick_program_event(expires_next, 0)) { + cpu_base->hang_detected = 0; + return; + } + + /* + * The next timer was already expired due to: + * - tracing + * - long lasting callbacks + * - being scheduled away when running in a VM + * + * We need to prevent that we loop forever in the hrtimer + * interrupt routine. We give it 3 attempts to avoid + * overreacting on some spurious event. + * + * Acquire base lock for updating the offsets and retrieving + * the current time. + */ + raw_spin_lock(&cpu_base->lock); + now = hrtimer_update_base(cpu_base); + cpu_base->nr_retries++; + if (++retries < 3) + goto retry; + /* + * Give the system a chance to do something else than looping + * here. We stored the entry time, so we know exactly how long + * we spent here. We schedule the next event this amount of + * time away. + */ + cpu_base->nr_hangs++; + cpu_base->hang_detected = 1; + raw_spin_unlock(&cpu_base->lock); + delta = ktime_sub(now, entry_time); + if (delta.tv64 > cpu_base->max_hang_time.tv64) + cpu_base->max_hang_time = delta; + /* + * Limit it to a sensible value as we enforce a longer + * delay. Give the CPU at least 100ms to catch up.
+ */ + if (delta.tv64 > 100 * NSEC_PER_MSEC) + expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); + else + expires_next = ktime_add(now, delta); + tick_program_event(expires_next, 1); + printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", + ktime_to_ns(delta)); +} + +/* + * local version of hrtimer_peek_ahead_timers() called with interrupts + * disabled. + */ +static void __hrtimer_peek_ahead_timers(void) +{ + struct tick_device *td; + + if (!hrtimer_hres_active()) + return; + + td = &__get_cpu_var(tick_cpu_device); + if (td && td->evtdev) + hrtimer_interrupt(td->evtdev); +} + +/** + * hrtimer_peek_ahead_timers -- run soft-expired timers now + * + * hrtimer_peek_ahead_timers will peek at the timer queue of + * the current cpu and check if there are any timers for which + * the soft expires time has passed. If any such timers exist, + * they are run immediately and then removed from the timer queue. + * + */ +void hrtimer_peek_ahead_timers(void) +{ + unsigned long flags; + + local_irq_save(flags); + __hrtimer_peek_ahead_timers(); + local_irq_restore(flags); +} +/* Softirq action: run soft-expired timers via hrtimer_peek_ahead_timers() */ +static void run_hrtimer_softirq(struct softirq_action *h) +{ + hrtimer_peek_ahead_timers(); +} + +#else /* CONFIG_HIGH_RES_TIMERS */ + +static inline void __hrtimer_peek_ahead_timers(void) { } + +#endif /* !CONFIG_HIGH_RES_TIMERS */ + +/* + * Called from timer softirq every jiffy, expire hrtimers: + * + * For HRT it's the fallback code to run the softirq in the timer + * softirq context in case the hrtimer initialization failed or has + * not been done yet. + */ +void hrtimer_run_pending(void) +{ + if (hrtimer_hres_active()) + return; + + /* + * This _is_ ugly: We have to check in the softirq context, + * whether we can switch to highres and / or nohz mode. The + * clocksource switch happens in the timer interrupt with + * xtime_lock held. Notification from there only sets the + * check bit in the tick_oneshot code, otherwise we might + * deadlock vs. xtime_lock.
+ */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) + hrtimer_switch_to_hres(); +} + +/* + * Called from hardirq context every jiffy + * + * Does nothing when high resolution mode is active. Otherwise runs all + * timers whose expiry is before the softirq time of their clock base. + */ +void hrtimer_run_queues(void) +{ + struct timerqueue_node *node; + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base; + int index, gettime = 1; + + if (hrtimer_hres_active()) + return; + + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { + base = &cpu_base->clock_base[index]; + if (!timerqueue_getnext(&base->active)) + continue; + + if (gettime) { + hrtimer_get_softirq_time(cpu_base); + gettime = 0; + } + + raw_spin_lock(&cpu_base->lock); + + while ((node = timerqueue_getnext(&base->active))) { + struct hrtimer *timer; + + timer = container_of(node, struct hrtimer, node); + if (base->softirq_time.tv64 <= + hrtimer_get_expires_tv64(timer)) + break; + + __run_hrtimer(timer, &base->softirq_time); + } + raw_spin_unlock(&cpu_base->lock); + } +} + +/* + * Sleep related functions: + */ +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) +{ + struct hrtimer_sleeper *t = + container_of(timer, struct hrtimer_sleeper, timer); + struct task_struct *task = t->task; + + t->task = NULL; + if (task) + wake_up_process(task); + + return HRTIMER_NORESTART; +} +/* Set hrtimer_wakeup() as the callback of the sleeper's timer and record the task to wake */ +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) +{ + sl->timer.function = hrtimer_wakeup; + sl->task = task; +} +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); +/* + * Sleep interruptibly until the sleeper's task pointer has been cleared or + * a signal is pending for the current task. The task pointer is cleared by + * hrtimer_wakeup(), or here when the timer is not active after being + * started. + * + * Returns nonzero when t->task is NULL on exit, 0 otherwise. + */ +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) +{ + hrtimer_init_sleeper(t, current); + + do { + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_start_expires(&t->timer, mode); + if (!hrtimer_active(&t->timer)) + t->task = NULL; + + if (likely(t->task)) + freezable_schedule(); + + hrtimer_cancel(&t->timer); + mode = HRTIMER_MODE_ABS; + + } while (t->task && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + + return t->task == NULL; +} + +static int
update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) +{ + struct timespec rmt; + ktime_t rem; + + rem = hrtimer_expires_remaining(timer); + if (rem.tv64 <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) + return -EFAULT; + + return 1; +} + +long __sched hrtimer_nanosleep_restart(struct restart_block *restart) +{ + struct hrtimer_sleeper t; + struct timespec __user *rmtp; + int ret = 0; + + hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, + HRTIMER_MODE_ABS); + hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); + + if (do_nanosleep(&t, HRTIMER_MODE_ABS)) + goto out; + + rmtp = restart->nanosleep.rmtp; + if (rmtp) { + ret = update_rmtp(&t.timer, rmtp); + if (ret <= 0) + goto out; + } + + /* The other values in restart are already filled in */ + ret = -ERESTART_RESTARTBLOCK; +out: + destroy_hrtimer_on_stack(&t.timer); + return ret; +} + +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, + const enum hrtimer_mode mode, const clockid_t clockid) +{ + struct restart_block *restart; + struct hrtimer_sleeper t; + int ret = 0; + unsigned long slack; + + slack = current->timer_slack_ns; + if (dl_task(current) || rt_task(current)) + slack = 0; + + hrtimer_init_on_stack(&t.timer, clockid, mode); + hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); + if (do_nanosleep(&t, mode)) + goto out; + + /* Absolute timers do not update the rmtp value and restart: */ + if (mode == HRTIMER_MODE_ABS) { + ret = -ERESTARTNOHAND; + goto out; + } + + if (rmtp) { + ret = update_rmtp(&t.timer, rmtp); + if (ret <= 0) + goto out; + } + + restart = ¤t_thread_info()->restart_block; + restart->fn = hrtimer_nanosleep_restart; + restart->nanosleep.clockid = t.timer.base->clockid; + restart->nanosleep.rmtp = rmtp; + restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + + ret = -ERESTART_RESTARTBLOCK; +out: + destroy_hrtimer_on_stack(&t.timer); + return ret; +} + 
+SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, + struct timespec __user *, rmtp) +{ + struct timespec tu; + + if (copy_from_user(&tu, rqtp, sizeof(tu))) + return -EFAULT; + + if (!timespec_valid(&tu)) + return -EINVAL; + + return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); +} + +/* + * Functions related to boot-time initialization: + */ +static void init_hrtimers_cpu(int cpu) +{ + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + int i; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + cpu_base->clock_base[i].cpu_base = cpu_base; + timerqueue_init_head(&cpu_base->clock_base[i].active); + } + + hrtimer_init_hres(cpu_base); +} + +#ifdef CONFIG_HOTPLUG_CPU + +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base) +{ + struct hrtimer *timer; + struct timerqueue_node *node; + + while ((node = timerqueue_getnext(&old_base->active))) { + timer = container_of(node, struct hrtimer, node); + BUG_ON(hrtimer_callback_running(timer)); + debug_deactivate(timer); + + /* + * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * timer could be seen as !active and just vanish away + * under us on another CPU + */ + __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); + timer->base = new_base; + /* + * Enqueue the timers on the new cpu. This does not + * reprogram the event device in case the timer + * expires before the earliest on this CPU, but we run + * hrtimer_interrupt after we migrated everything to + * sort out already expired timers and reprogram the + * event device. 
+ */ + enqueue_hrtimer(timer, new_base); + + /* Clear the migration state bit */ + timer->state &= ~HRTIMER_STATE_MIGRATE; + } +} + +static void migrate_hrtimers(int scpu) +{ + struct hrtimer_cpu_base *old_base, *new_base; + int i; + + BUG_ON(cpu_online(scpu)); + tick_cancel_sched_timer(scpu); + + local_irq_disable(); + old_base = &per_cpu(hrtimer_bases, scpu); + new_base = &__get_cpu_var(hrtimer_bases); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + raw_spin_lock(&new_base->lock); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i]); + } + + raw_spin_unlock(&old_base->lock); + raw_spin_unlock(&new_base->lock); + + /* Check, if we got expired work to do */ + __hrtimer_peek_ahead_timers(); + local_irq_enable(); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +static int hrtimer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int scpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + init_hrtimers_cpu(scpu); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DYING: + case CPU_DYING_FROZEN: + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + { + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); + migrate_hrtimers(scpu); + break; + } +#endif + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block hrtimers_nb = { + .notifier_call = hrtimer_cpu_notify, +}; + +void __init hrtimers_init(void) +{ + hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); +#endif +} + +/** + * schedule_hrtimeout_range_clock - sleep until timeout + * @expires: timeout 
value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME + */ +int __sched +schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode, int clock) +{ + struct hrtimer_sleeper t; + + /* + * Optimize when a zero timeout value is given. It does not + * matter whether this is an absolute or a relative time. + */ + if (expires && !expires->tv64) { + __set_current_state(TASK_RUNNING); + return 0; + } + + /* + * A NULL parameter means "infinite" + */ + if (!expires) { + schedule(); + __set_current_state(TASK_RUNNING); + return -EINTR; + } + + hrtimer_init_on_stack(&t.timer, clock, mode); + hrtimer_set_expires_range_ns(&t.timer, *expires, delta); + + hrtimer_init_sleeper(&t, current); + + hrtimer_start_expires(&t.timer, mode); + if (!hrtimer_active(&t.timer)) + t.task = NULL; + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + + __set_current_state(TASK_RUNNING); + + return !t.task ? 0 : -EINTR; +} + +/** + * schedule_hrtimeout_range - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly. + * The kernel give the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns. 
+ * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range_clock(expires, delta, mode, + CLOCK_MONOTONIC); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); + +/** + * schedule_hrtimeout - sleep until timeout + * @expires: timeout value (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns. + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. 
+ * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout(ktime_t *expires, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range(expires, 0, mode); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c new file mode 100644 index 0000000..8d262b4 --- /dev/null +++ b/kernel/time/itimer.c @@ -0,0 +1,301 @@ +/* + * linux/kernel/itimer.c + * + * Copyright (C) 1992 Darren Senn + */ + +/* These are all the functions necessary to implement itimers */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +/** + * itimer_get_remtime - get remaining time for the timer + * + * @timer: the timer to read + * + * Returns the delta between the expiry time and now, which can be + * less than zero or 1usec for an pending expired timer + */ +static struct timeval itimer_get_remtime(struct hrtimer *timer) +{ + ktime_t rem = hrtimer_get_remaining(timer); + + /* + * Racy but safe: if the itimer expires after the above + * hrtimer_get_remtime() call but before this condition + * then we return 0 - which is correct. 
+ */ + if (hrtimer_active(timer)) { + if (rem.tv64 <= 0) + rem.tv64 = NSEC_PER_USEC; + } else + rem.tv64 = 0; + + return ktime_to_timeval(rem); +} + +static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerval *const value) +{ + cputime_t cval, cinterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (cval) { + struct task_cputime cputime; + cputime_t t; + + thread_group_cputimer(tsk, &cputime); + if (clock_id == CPUCLOCK_PROF) + t = cputime.utime + cputime.stime; + else + /* CPUCLOCK_VIRT */ + t = cputime.utime; + + if (cval < t) + /* about to fire */ + cval = cputime_one_jiffy; + else + cval = cval - t; + } + + spin_unlock_irq(&tsk->sighand->siglock); + + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); +} + +int do_getitimer(int which, struct itimerval *value) +{ + struct task_struct *tsk = current; + + switch (which) { + case ITIMER_REAL: + spin_lock_irq(&tsk->sighand->siglock); + value->it_value = itimer_get_remtime(&tsk->signal->real_timer); + value->it_interval = + ktime_to_timeval(tsk->signal->it_real_incr); + spin_unlock_irq(&tsk->sighand->siglock); + break; + case ITIMER_VIRTUAL: + get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); + break; + case ITIMER_PROF: + get_cpu_itimer(tsk, CPUCLOCK_PROF, value); + break; + default: + return(-EINVAL); + } + return 0; +} + +SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) +{ + int error = -EFAULT; + struct itimerval get_buffer; + + if (value) { + error = do_getitimer(which, &get_buffer); + if (!error && + copy_to_user(value, &get_buffer, sizeof(get_buffer))) + error = -EFAULT; + } + return error; +} + + +/* + * The timer is automagically restarted, when interval != 0 + */ +enum hrtimer_restart it_real_fn(struct hrtimer *timer) +{ + struct signal_struct *sig = + container_of(timer, struct signal_struct, real_timer); 
+ + trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); + kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); + + return HRTIMER_NORESTART; +} + +static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) +{ + struct timespec ts; + s64 cpu_ns; + + cputime_to_timespec(ct, &ts); + cpu_ns = timespec_to_ns(&ts); + + return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; +} + +static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + const struct itimerval *const value, + struct itimerval *const ovalue) +{ + cputime_t cval, nval, cinterval, ninterval; + s64 ns_ninterval, ns_nval; + u32 error, incr_error; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + nval = timeval_to_cputime(&value->it_value); + ns_nval = timeval_to_ns(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + ns_ninterval = timeval_to_ns(&value->it_interval); + + error = cputime_sub_ns(nval, ns_nval); + incr_error = cputime_sub_ns(ninterval, ns_ninterval); + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (cval || nval) { + if (nval > 0) + nval += cputime_one_jiffy; + set_process_cpu_timer(tsk, clock_id, &nval, &cval); + } + it->expires = nval; + it->incr = ninterval; + it->error = error; + it->incr_error = incr_error; + trace_itimer_state(clock_id == CPUCLOCK_VIRT ? + ITIMER_VIRTUAL : ITIMER_PROF, value, nval); + + spin_unlock_irq(&tsk->sighand->siglock); + + if (ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } +} + +/* + * Returns true if the timeval is in canonical form + */ +#define timeval_valid(t) \ + (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) + +int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +{ + struct task_struct *tsk = current; + struct hrtimer *timer; + ktime_t expires; + + /* + * Validate the timevals in value. 
+ */ + if (!timeval_valid(&value->it_value) || + !timeval_valid(&value->it_interval)) + return -EINVAL; + + switch (which) { + case ITIMER_REAL: +again: + spin_lock_irq(&tsk->sighand->siglock); + timer = &tsk->signal->real_timer; + if (ovalue) { + ovalue->it_value = itimer_get_remtime(timer); + ovalue->it_interval + = ktime_to_timeval(tsk->signal->it_real_incr); + } + /* We are sharing ->siglock with it_real_fn() */ + if (hrtimer_try_to_cancel(timer) < 0) { + spin_unlock_irq(&tsk->sighand->siglock); + goto again; + } + expires = timeval_to_ktime(value->it_value); + if (expires.tv64 != 0) { + tsk->signal->it_real_incr = + timeval_to_ktime(value->it_interval); + hrtimer_start(timer, expires, HRTIMER_MODE_REL); + } else + tsk->signal->it_real_incr.tv64 = 0; + + trace_itimer_state(ITIMER_REAL, value, 0); + spin_unlock_irq(&tsk->sighand->siglock); + break; + case ITIMER_VIRTUAL: + set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); + break; + case ITIMER_PROF: + set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); + break; + default: + return -EINVAL; + } + return 0; +} + +/** + * alarm_setitimer - set alarm in seconds + * + * @seconds: number of seconds until alarm + * 0 disables the alarm + * + * Returns the remaining time in seconds of a pending timer or 0 when + * the timer is not active. + * + * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid + * negative timeval settings which would cause immediate expiry. + */ +unsigned int alarm_setitimer(unsigned int seconds) +{ + struct itimerval it_new, it_old; + +#if BITS_PER_LONG < 64 + if (seconds > INT_MAX) + seconds = INT_MAX; +#endif + it_new.it_value.tv_sec = seconds; + it_new.it_value.tv_usec = 0; + it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + + do_setitimer(ITIMER_REAL, &it_new, &it_old); + + /* + * We can't return 0 if we have an alarm pending ... 
And we'd + * better return too much than too little anyway + */ + if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || + it_old.it_value.tv_usec >= 500000) + it_old.it_value.tv_sec++; + + return it_old.it_value.tv_sec; +} + +SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, + struct itimerval __user *, ovalue) +{ + struct itimerval set_buffer, get_buffer; + int error; + + if (value) { + if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) + return -EFAULT; + } else { + memset(&set_buffer, 0, sizeof(set_buffer)); + printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." + " Misfeature support will be removed\n", + current->comm); + } + + error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); + if (error || !ovalue) + return error; + + if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) + return -EFAULT; + return 0; +} diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c new file mode 100644 index 0000000..3b89464 --- /dev/null +++ b/kernel/time/posix-cpu-timers.c @@ -0,0 +1,1490 @@ +/* + * Implement CPU time clocks for the POSIX clock interface. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Called after updating RLIMIT_CPU to run cpu timer and update + * tsk->signal->cputime_expires expiration cache if necessary. Needs + * siglock protection since other code may update expiration cache as + * well. 
+ */ +void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) +{ + cputime_t cputime = secs_to_cputime(rlim_new); + + spin_lock_irq(&task->sighand->siglock); + set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(&task->sighand->siglock); +} + +static int check_clock(const clockid_t which_clock) +{ + int error = 0; + struct task_struct *p; + const pid_t pid = CPUCLOCK_PID(which_clock); + + if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) + return -EINVAL; + + if (pid == 0) + return 0; + + rcu_read_lock(); + p = find_task_by_vpid(pid); + if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? + same_thread_group(p, current) : has_group_leader_pid(p))) { + error = -EINVAL; + } + rcu_read_unlock(); + + return error; +} + +static inline unsigned long long +timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) +{ + unsigned long long ret; + + ret = 0; /* high half always zero when .cpu used */ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + } else { + ret = cputime_to_expires(timespec_to_cputime(tp)); + } + return ret; +} + +static void sample_to_timespec(const clockid_t which_clock, + unsigned long long expires, + struct timespec *tp) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) + *tp = ns_to_timespec(expires); + else + cputime_to_timespec((__force cputime_t)expires, tp); +} + +/* + * Update expiry time from increment, and increase overrun count, + * given the current clock sample. + */ +static void bump_cpu_timer(struct k_itimer *timer, + unsigned long long now) +{ + int i; + unsigned long long delta, incr; + + if (timer->it.cpu.incr == 0) + return; + + if (now < timer->it.cpu.expires) + return; + + incr = timer->it.cpu.incr; + delta = now + incr - timer->it.cpu.expires; + + /* Don't use (incr*2 < delta), incr*2 might overflow. 
*/ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + + for (; i >= 0; incr >>= 1, i--) { + if (delta < incr) + continue; + + timer->it.cpu.expires += incr; + timer->it_overrun += 1 << i; + delta -= incr; + } +} + +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) + return 1; + return 0; +} + +static inline unsigned long long prof_ticks(struct task_struct *p) +{ + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + + return cputime_to_expires(utime + stime); +} +static inline unsigned long long virt_ticks(struct task_struct *p) +{ + cputime_t utime; + + task_cputime(p, &utime, NULL); + + return cputime_to_expires(utime); +} + +static int +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) +{ + int error = check_clock(which_clock); + if (!error) { + tp->tv_sec = 0; + tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + /* + * If sched_clock is using a cycle counter, we + * don't have any idea of its true resolution + * exported, but it is much more than 1s/HZ. + */ + tp->tv_nsec = 1; + } + } + return error; +} + +static int +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) +{ + /* + * You can never reset a CPU clock, but we check for other errors + * in the call before failing with EPERM. + */ + int error = check_clock(which_clock); + if (error == 0) { + error = -EPERM; + } + return error; +} + + +/* + * Sample a per-thread clock for the given task. 
+ */ +static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, + unsigned long long *sample) +{ + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + *sample = prof_ticks(p); + break; + case CPUCLOCK_VIRT: + *sample = virt_ticks(p); + break; + case CPUCLOCK_SCHED: + *sample = task_sched_runtime(p); + break; + } + return 0; +} + +static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) +{ + if (b->utime > a->utime) + a->utime = b->utime; + + if (b->stime > a->stime) + a->stime = b->stime; + + if (b->sum_exec_runtime > a->sum_exec_runtime) + a->sum_exec_runtime = b->sum_exec_runtime; +} + +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct task_cputime sum; + unsigned long flags; + + if (!cputimer->running) { + /* + * The POSIX timer interface allows for absolute time expiry + * values through the TIMER_ABSTIME flag, therefore we have + * to synchronize the timer to the clock every time we start + * it. + */ + thread_group_cputime(tsk, &sum); + raw_spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 1; + update_gt_cputime(&cputimer->cputime, &sum); + } else + raw_spin_lock_irqsave(&cputimer->lock, flags); + *times = cputimer->cputime; + raw_spin_unlock_irqrestore(&cputimer->lock, flags); +} + +/* + * Sample a process (thread group) clock for the given group_leader task. + * Must be called with task sighand lock held for safe while_each_thread() + * traversal. 
+ */ +static int cpu_clock_sample_group(const clockid_t which_clock, + struct task_struct *p, + unsigned long long *sample) +{ + struct task_cputime cputime; + + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + thread_group_cputime(p, &cputime); + *sample = cputime_to_expires(cputime.utime + cputime.stime); + break; + case CPUCLOCK_VIRT: + thread_group_cputime(p, &cputime); + *sample = cputime_to_expires(cputime.utime); + break; + case CPUCLOCK_SCHED: + thread_group_cputime(p, &cputime); + *sample = cputime.sum_exec_runtime; + break; + } + return 0; +} + +static int posix_cpu_clock_get_task(struct task_struct *tsk, + const clockid_t which_clock, + struct timespec *tp) +{ + int err = -EINVAL; + unsigned long long rtn; + + if (CPUCLOCK_PERTHREAD(which_clock)) { + if (same_thread_group(tsk, current)) + err = cpu_clock_sample(which_clock, tsk, &rtn); + } else { + unsigned long flags; + struct sighand_struct *sighand; + + /* + * while_each_thread() is not yet entirely RCU safe, + * keep locking the group while sampling process + * clock for now. + */ + sighand = lock_task_sighand(tsk, &flags); + if (!sighand) + return err; + + if (tsk == current || thread_group_leader(tsk)) + err = cpu_clock_sample_group(which_clock, tsk, &rtn); + + unlock_task_sighand(tsk, &flags); + } + + if (!err) + sample_to_timespec(which_clock, rtn, tp); + + return err; +} + + +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) +{ + const pid_t pid = CPUCLOCK_PID(which_clock); + int err = -EINVAL; + + if (pid == 0) { + /* + * Special case constant value for our own clocks. + * We don't have to do any lookup to find ourselves. + */ + err = posix_cpu_clock_get_task(current, which_clock, tp); + } else { + /* + * Find the given PID, and validate that the caller + * should be able to see it. 
+ */ + struct task_struct *p; + rcu_read_lock(); + p = find_task_by_vpid(pid); + if (p) + err = posix_cpu_clock_get_task(p, which_clock, tp); + rcu_read_unlock(); + } + + return err; +} + + +/* + * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. + * This is called from sys_timer_create() and do_cpu_nanosleep() with the + * new timer already all-zeros initialized. + */ +static int posix_cpu_timer_create(struct k_itimer *new_timer) +{ + int ret = 0; + const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); + struct task_struct *p; + + if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) + return -EINVAL; + + INIT_LIST_HEAD(&new_timer->it.cpu.entry); + + rcu_read_lock(); + if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { + if (pid == 0) { + p = current; + } else { + p = find_task_by_vpid(pid); + if (p && !same_thread_group(p, current)) + p = NULL; + } + } else { + if (pid == 0) { + p = current->group_leader; + } else { + p = find_task_by_vpid(pid); + if (p && !has_group_leader_pid(p)) + p = NULL; + } + } + new_timer->it.cpu.task = p; + if (p) { + get_task_struct(p); + } else { + ret = -EINVAL; + } + rcu_read_unlock(); + + return ret; +} + +/* + * Clean up a CPU-clock timer that is about to be destroyed. + * This is called from timer deletion with the timer already locked. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) + */ +static int posix_cpu_timer_del(struct k_itimer *timer) +{ + int ret = 0; + unsigned long flags; + struct sighand_struct *sighand; + struct task_struct *p = timer->it.cpu.task; + + WARN_ON_ONCE(p == NULL); + + /* + * Protect against sighand release/switch in exit/exec and process/ + * thread timer list entry concurrent read/writes. + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { + /* + * We raced with the reaping of the task. + * The deletion should have cleared us off the list. 
+ */ + WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry)); + } else { + if (timer->it.cpu.firing) + ret = TIMER_RETRY; + else + list_del(&timer->it.cpu.entry); + + unlock_task_sighand(p, &flags); + } + + if (!ret) + put_task_struct(p); + + return ret; +} + +static void cleanup_timers_list(struct list_head *head) +{ + struct cpu_timer_list *timer, *next; + + list_for_each_entry_safe(timer, next, head, entry) + list_del_init(&timer->entry); +} + +/* + * Clean out CPU timers still ticking when a thread exited. The task + * pointer is cleared, and the expiry time is replaced with the residual + * time for later timer_gettime calls to return. + * This must be called with the siglock held. + */ +static void cleanup_timers(struct list_head *head) +{ + cleanup_timers_list(head); + cleanup_timers_list(++head); + cleanup_timers_list(++head); +} + +/* + * These are both called with the siglock held, when the current thread + * is being reaped. When the final (leader) thread in the group is reaped, + * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit. + */ +void posix_cpu_timers_exit(struct task_struct *tsk) +{ + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, + sizeof(unsigned long long)); + cleanup_timers(tsk->cpu_timers); + +} +void posix_cpu_timers_exit_group(struct task_struct *tsk) +{ + cleanup_timers(tsk->signal->cpu_timers); +} + +static inline int expires_gt(cputime_t expires, cputime_t new_exp) +{ + return expires == 0 || expires > new_exp; +} + +/* + * Insert the timer on the appropriate list before any timers that + * expire later. This must be called with the sighand lock held. 
+ */ +static void arm_timer(struct k_itimer *timer) +{ + struct task_struct *p = timer->it.cpu.task; + struct list_head *head, *listpos; + struct task_cputime *cputime_expires; + struct cpu_timer_list *const nt = &timer->it.cpu; + struct cpu_timer_list *next; + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + head = p->cpu_timers; + cputime_expires = &p->cputime_expires; + } else { + head = p->signal->cpu_timers; + cputime_expires = &p->signal->cputime_expires; + } + head += CPUCLOCK_WHICH(timer->it_clock); + + listpos = head; + list_for_each_entry(next, head, entry) { + if (nt->expires < next->expires) + break; + listpos = &next->entry; + } + list_add(&nt->entry, listpos); + + if (listpos == head) { + unsigned long long exp = nt->expires; + + /* + * We are the new earliest-expiring POSIX 1.b timer, hence + * need to update expiration cache. Take into account that + * for process timers we share expiration cache with itimers + * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. + */ + + switch (CPUCLOCK_WHICH(timer->it_clock)) { + case CPUCLOCK_PROF: + if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) + cputime_expires->prof_exp = expires_to_cputime(exp); + break; + case CPUCLOCK_VIRT: + if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) + cputime_expires->virt_exp = expires_to_cputime(exp); + break; + case CPUCLOCK_SCHED: + if (cputime_expires->sched_exp == 0 || + cputime_expires->sched_exp > exp) + cputime_expires->sched_exp = exp; + break; + } + } +} + +/* + * The timer is locked, fire it and arrange for its reload. + */ +static void cpu_timer_fire(struct k_itimer *timer) +{ + if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + /* + * User don't want any signal. + */ + timer->it.cpu.expires = 0; + } else if (unlikely(timer->sigq == NULL)) { + /* + * This a special case for clock_nanosleep, + * not a normal timer from sys_timer_create. 
+ */ + wake_up_process(timer->it_process); + timer->it.cpu.expires = 0; + } else if (timer->it.cpu.incr == 0) { + /* + * One-shot timer. Clear it as soon as it's fired. + */ + posix_timer_event(timer, 0); + timer->it.cpu.expires = 0; + } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { + /* + * The signal did not get queued because the signal + * was ignored, so we won't get any callback to + * reload the timer. But we need to keep it + * ticking in case the signal is deliverable next time. + */ + posix_cpu_timer_schedule(timer); + } +} + +/* + * Sample a process (thread group) timer for the given group_leader task. + * Must be called with task sighand lock held for safe while_each_thread() + * traversal. + */ +static int cpu_timer_sample_group(const clockid_t which_clock, + struct task_struct *p, + unsigned long long *sample) +{ + struct task_cputime cputime; + + thread_group_cputimer(p, &cputime); + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + *sample = cputime_to_expires(cputime.utime + cputime.stime); + break; + case CPUCLOCK_VIRT: + *sample = cputime_to_expires(cputime.utime); + break; + case CPUCLOCK_SCHED: + *sample = cputime.sum_exec_runtime + task_delta_exec(p); + break; + } + return 0; +} + +#ifdef CONFIG_NO_HZ_FULL +static void nohz_kick_work_fn(struct work_struct *work) +{ + tick_nohz_full_kick_all(); +} + +static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); + +/* + * We need the IPIs to be sent from sane process context. + * The posix cpu timers are always set with irqs disabled. 
+ */ +static void posix_cpu_timer_kick_nohz(void) +{ + if (context_tracking_is_enabled()) + schedule_work(&nohz_kick_work); +} + +bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) +{ + if (!task_cputime_zero(&tsk->cputime_expires)) + return false; + + if (tsk->signal->cputimer.running) + return false; + + return true; +} +#else +static inline void posix_cpu_timer_kick_nohz(void) { } +#endif + +/* + * Guts of sys_timer_settime for CPU timers. + * This is called with the timer locked and interrupts disabled. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) + */ +static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, + struct itimerspec *new, struct itimerspec *old) +{ + unsigned long flags; + struct sighand_struct *sighand; + struct task_struct *p = timer->it.cpu.task; + unsigned long long old_expires, new_expires, old_incr, val; + int ret; + + WARN_ON_ONCE(p == NULL); + + new_expires = timespec_to_sample(timer->it_clock, &new->it_value); + + /* + * Protect against sighand release/switch in exit/exec and p->cpu_timers + * and p->signal->cpu_timers read/write in arm_timer() + */ + sighand = lock_task_sighand(p, &flags); + /* + * If p has just been reaped, we can no + * longer get any information about it at all. + */ + if (unlikely(sighand == NULL)) { + return -ESRCH; + } + + /* + * Disarm any old timer after extracting its expiry time. + */ + WARN_ON_ONCE(!irqs_disabled()); + + ret = 0; + old_incr = timer->it.cpu.incr; + old_expires = timer->it.cpu.expires; + if (unlikely(timer->it.cpu.firing)) { + timer->it.cpu.firing = -1; + ret = TIMER_RETRY; + } else + list_del_init(&timer->it.cpu.entry); + + /* + * We need to sample the current value to convert the new + * value from to relative and absolute, and to convert the + * old value from absolute to relative. 
To set a process + * timer, we need a sample to balance the thread expiry + * times (in arm_timer). With an absolute time, we must + * check if it's already passed. In short, we need a sample. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &val); + } else { + cpu_timer_sample_group(timer->it_clock, p, &val); + } + + if (old) { + if (old_expires == 0) { + old->it_value.tv_sec = 0; + old->it_value.tv_nsec = 0; + } else { + /* + * Update the timer in case it has + * overrun already. If it has, + * we'll report it as having overrun + * and with the next reloaded timer + * already ticking, though we are + * swallowing that pending + * notification here to install the + * new setting. + */ + bump_cpu_timer(timer, val); + if (val < timer->it.cpu.expires) { + old_expires = timer->it.cpu.expires - val; + sample_to_timespec(timer->it_clock, + old_expires, + &old->it_value); + } else { + old->it_value.tv_nsec = 1; + old->it_value.tv_sec = 0; + } + } + } + + if (unlikely(ret)) { + /* + * We are colliding with the timer actually firing. + * Punt after filling in the timer's old value, and + * disable this firing since we are already reporting + * it as an overrun (thanks to bump_cpu_timer above). + */ + unlock_task_sighand(p, &flags); + goto out; + } + + if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) { + new_expires += val; + } + + /* + * Install the new expiry time (or zero). + * For a timer with no notification action, we don't actually + * arm the timer (we'll just fake it for timer_gettime). + */ + timer->it.cpu.expires = new_expires; + if (new_expires != 0 && val < new_expires) { + arm_timer(timer); + } + + unlock_task_sighand(p, &flags); + /* + * Install the new reload setting, and + * set up the signal and overrun bookkeeping. 
+ */ + timer->it.cpu.incr = timespec_to_sample(timer->it_clock, + &new->it_interval); + + /* + * This acts as a modification timestamp for the timer, + * so any automatic reload attempt will punt on seeing + * that we have reset the timer manually. + */ + timer->it_requeue_pending = (timer->it_requeue_pending + 2) & + ~REQUEUE_PENDING; + timer->it_overrun_last = 0; + timer->it_overrun = -1; + + if (new_expires != 0 && !(val < new_expires)) { + /* + * The designated time already passed, so we notify + * immediately, even if the thread never runs to + * accumulate more time on this clock. + */ + cpu_timer_fire(timer); + } + + ret = 0; + out: + if (old) { + sample_to_timespec(timer->it_clock, + old_incr, &old->it_interval); + } + if (!ret) + posix_cpu_timer_kick_nohz(); + return ret; +} + +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +{ + unsigned long long now; + struct task_struct *p = timer->it.cpu.task; + + WARN_ON_ONCE(p == NULL); + + /* + * Easy part: convert the reload time. + */ + sample_to_timespec(timer->it_clock, + timer->it.cpu.incr, &itp->it_interval); + + if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ + itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; + return; + } + + /* + * Sample the clock to take the difference with the expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &now); + } else { + struct sighand_struct *sighand; + unsigned long flags; + + /* + * Protect against sighand release/switch in exit/exec and + * also make timer sampling safe if it ends up calling + * thread_group_cputime(). + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { + /* + * The process has been reaped. + * We can't even collect a sample any more. + * Call the timer disarmed, nothing else to do. 
+ */ + timer->it.cpu.expires = 0; + sample_to_timespec(timer->it_clock, timer->it.cpu.expires, + &itp->it_value); + } else { + cpu_timer_sample_group(timer->it_clock, p, &now); + unlock_task_sighand(p, &flags); + } + } + + if (now < timer->it.cpu.expires) { + sample_to_timespec(timer->it_clock, + timer->it.cpu.expires - now, + &itp->it_value); + } else { + /* + * The timer should have expired already, but the firing + * hasn't taken place yet. Say it's just about to expire. + */ + itp->it_value.tv_nsec = 1; + itp->it_value.tv_sec = 0; + } +} + +static unsigned long long +check_timers_list(struct list_head *timers, + struct list_head *firing, + unsigned long long curr) +{ + int maxfire = 20; + + while (!list_empty(timers)) { + struct cpu_timer_list *t; + + t = list_first_entry(timers, struct cpu_timer_list, entry); + + if (!--maxfire || curr < t->expires) + return t->expires; + + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + return 0; +} + +/* + * Check for any per-thread CPU timers that have fired and move them off + * the tsk->cpu_timers[N] list onto the firing list. Here we update the + * tsk->it_*_expires values to reflect the remaining thread CPU timers. + */ +static void check_thread_timers(struct task_struct *tsk, + struct list_head *firing) +{ + struct list_head *timers = tsk->cpu_timers; + struct signal_struct *const sig = tsk->signal; + struct task_cputime *tsk_expires = &tsk->cputime_expires; + unsigned long long expires; + unsigned long soft; + + expires = check_timers_list(timers, firing, prof_ticks(tsk)); + tsk_expires->prof_exp = expires_to_cputime(expires); + + expires = check_timers_list(++timers, firing, virt_ticks(tsk)); + tsk_expires->virt_exp = expires_to_cputime(expires); + + tsk_expires->sched_exp = check_timers_list(++timers, firing, + tsk->se.sum_exec_runtime); + + /* + * Check for the special case thread timers. 
+ */ + soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); + if (soft != RLIM_INFINITY) { + unsigned long hard = + ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); + + if (hard != RLIM_INFINITY && + tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + /* + * At the hard limit, we just die. + * No need to calculate anything else now. + */ + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } + if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + /* + * At the soft limit, send a SIGXCPU every second. + */ + if (soft < hard) { + soft += USEC_PER_SEC; + sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; + } + printk(KERN_INFO + "RT Watchdog Timeout: %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } +} + +static void stop_process_timers(struct signal_struct *sig) +{ + struct thread_group_cputimer *cputimer = &sig->cputimer; + unsigned long flags; + + raw_spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 0; + raw_spin_unlock_irqrestore(&cputimer->lock, flags); +} + +static u32 onecputick; + +static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, + unsigned long long *expires, + unsigned long long cur_time, int signo) +{ + if (!it->expires) + return; + + if (cur_time >= it->expires) { + if (it->incr) { + it->expires += it->incr; + it->error += it->incr_error; + if (it->error >= onecputick) { + it->expires -= cputime_one_jiffy; + it->error -= onecputick; + } + } else { + it->expires = 0; + } + + trace_itimer_expire(signo == SIGPROF ? + ITIMER_PROF : ITIMER_VIRTUAL, + tsk->signal->leader_pid, cur_time); + __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + } + + if (it->expires && (!*expires || it->expires < *expires)) { + *expires = it->expires; + } +} + +/* + * Check for any per-thread CPU timers that have fired and move them + * off the tsk->*_timers list onto the firing list. Per-thread timers + * have already been taken off. 
+ */ +static void check_process_timers(struct task_struct *tsk, + struct list_head *firing) +{ + struct signal_struct *const sig = tsk->signal; + unsigned long long utime, ptime, virt_expires, prof_expires; + unsigned long long sum_sched_runtime, sched_expires; + struct list_head *timers = sig->cpu_timers; + struct task_cputime cputime; + unsigned long soft; + + /* + * Collect the current process totals. + */ + thread_group_cputimer(tsk, &cputime); + utime = cputime_to_expires(cputime.utime); + ptime = utime + cputime_to_expires(cputime.stime); + sum_sched_runtime = cputime.sum_exec_runtime; + + prof_expires = check_timers_list(timers, firing, ptime); + virt_expires = check_timers_list(++timers, firing, utime); + sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); + + /* + * Check for the special case process timers. + */ + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, + SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, + SIGVTALRM); + soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + if (soft != RLIM_INFINITY) { + unsigned long psecs = cputime_to_secs(ptime); + unsigned long hard = + ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); + cputime_t x; + if (psecs >= hard) { + /* + * At the hard limit, we just die. + * No need to calculate anything else now. + */ + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } + if (psecs >= soft) { + /* + * At the soft limit, send a SIGXCPU every second. 
+ */ + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + if (soft < hard) { + soft++; + sig->rlim[RLIMIT_CPU].rlim_cur = soft; + } + } + x = secs_to_cputime(soft); + if (!prof_expires || x < prof_expires) { + prof_expires = x; + } + } + + sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); + sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); + sig->cputime_expires.sched_exp = sched_expires; + if (task_cputime_zero(&sig->cputime_expires)) + stop_process_timers(sig); +} + +/* + * This is called from the signal code (via do_schedule_next_timer) + * when the last timer signal was delivered and we have to reload the timer. + */ +void posix_cpu_timer_schedule(struct k_itimer *timer) +{ + struct sighand_struct *sighand; + unsigned long flags; + struct task_struct *p = timer->it.cpu.task; + unsigned long long now; + + WARN_ON_ONCE(p == NULL); + + /* + * Fetch the current sample and update the timer's expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &now); + bump_cpu_timer(timer, now); + if (unlikely(p->exit_state)) + goto out; + + /* Protect timer list r/w in arm_timer() */ + sighand = lock_task_sighand(p, &flags); + if (!sighand) + goto out; + } else { + /* + * Protect arm_timer() and timer sampling in case of call to + * thread_group_cputime(). + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { + /* + * The process has been reaped. + * We can't even collect a sample any more. + */ + timer->it.cpu.expires = 0; + goto out; + } else if (unlikely(p->exit_state) && thread_group_empty(p)) { + unlock_task_sighand(p, &flags); + /* Optimizations: if the process is dying, no need to rearm */ + goto out; + } + cpu_timer_sample_group(timer->it_clock, p, &now); + bump_cpu_timer(timer, now); + /* Leave the sighand locked for the call below. */ + } + + /* + * Now re-arm for the new expiry time. 
+ */ + WARN_ON_ONCE(!irqs_disabled()); + arm_timer(timer); + unlock_task_sighand(p, &flags); + + /* Kick full dynticks CPUs in case they need to tick on the new timer */ + posix_cpu_timer_kick_nohz(); +out: + timer->it_overrun_last = timer->it_overrun; + timer->it_overrun = -1; + ++timer->it_requeue_pending; +} + +/** + * task_cputime_expired - Compare two task_cputime entities. + * + * @sample: The task_cputime structure to be checked for expiration. + * @expires: Expiration times, against which @sample will be checked. + * + * Checks @sample against @expires to see if any field of @sample has expired. + * Returns true if any field of the former is greater than the corresponding + * field of the latter if the latter field is set. Otherwise returns false. + */ +static inline int task_cputime_expired(const struct task_cputime *sample, + const struct task_cputime *expires) +{ + if (expires->utime && sample->utime >= expires->utime) + return 1; + if (expires->stime && sample->utime + sample->stime >= expires->stime) + return 1; + if (expires->sum_exec_runtime != 0 && + sample->sum_exec_runtime >= expires->sum_exec_runtime) + return 1; + return 0; +} + +/** + * fastpath_timer_check - POSIX CPU timers fast path. + * + * @tsk: The task (thread) being checked. + * + * Check the task and thread group timers. If both are zero (there are no + * timers set) return false. Otherwise snapshot the task and thread group + * timers and compare them with the corresponding expiration times. Return + * true if a timer has expired, else return false. 
+ */ +static inline int fastpath_timer_check(struct task_struct *tsk) +{ + struct signal_struct *sig; + cputime_t utime, stime; + + task_cputime(tsk, &utime, &stime); + + if (!task_cputime_zero(&tsk->cputime_expires)) { + struct task_cputime task_sample = { + .utime = utime, + .stime = stime, + .sum_exec_runtime = tsk->se.sum_exec_runtime + }; + + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + } + + sig = tsk->signal; + if (sig->cputimer.running) { + struct task_cputime group_sample; + + raw_spin_lock(&sig->cputimer.lock); + group_sample = sig->cputimer.cputime; + raw_spin_unlock(&sig->cputimer.lock); + + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) + return 1; + } + + return 0; +} + +/* + * This is called from the timer interrupt handler. The irq handler has + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. + */ +void run_posix_cpu_timers(struct task_struct *tsk) +{ + LIST_HEAD(firing); + struct k_itimer *timer, *next; + unsigned long flags; + + WARN_ON_ONCE(!irqs_disabled()); + + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) + return; + + if (!lock_task_sighand(tsk, &flags)) + return; + /* + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. + */ + check_thread_timers(tsk, &firing); + /* + * If there are any active process wide timers (POSIX 1.b, itimers, + * RLIMIT_CPU) cputimer must be running. + */ + if (tsk->signal->cputimer.running) + check_process_timers(tsk, &firing); + + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. 
We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + unlock_task_sighand(tsk, &flags); + + /* + * Now that all the timers on our list have the firing flag, + * no one will touch their list entries but us. We'll take + * each timer's lock before clearing its firing flag, so no + * timer call will interfere. + */ + list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { + int cpu_firing; + + spin_lock(&timer->it_lock); + list_del_init(&timer->it.cpu.entry); + cpu_firing = timer->it.cpu.firing; + timer->it.cpu.firing = 0; + /* + * The firing flag is -1 if we collided with a reset + * of the timer, which already reported this + * almost-firing as an overrun. So don't generate an event. + */ + if (likely(cpu_firing >= 0)) + cpu_timer_fire(timer); + spin_unlock(&timer->it_lock); + } +} + +/* + * Set one of the process-wide special case CPU timers or RLIMIT_CPU. + * The tsk->sighand->siglock must be held by the caller. + */ +void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, + cputime_t *newval, cputime_t *oldval) +{ + unsigned long long now; + + WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); + cpu_timer_sample_group(clock_idx, tsk, &now); + + if (oldval) { + /* + * We are setting itimer. The *oldval is absolute and we update + * it to be relative, *newval argument is relative and we update + * it to be absolute. + */ + if (*oldval) { + if (*oldval <= now) { + /* Just about to fire. */ + *oldval = cputime_one_jiffy; + } else { + *oldval -= now; + } + } + + if (!*newval) + goto out; + *newval += now; + } + + /* + * Update expiration cache if we are the earliest timer, or eventually + * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. 
+ */ + switch (clock_idx) { + case CPUCLOCK_PROF: + if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) + tsk->signal->cputime_expires.prof_exp = *newval; + break; + case CPUCLOCK_VIRT: + if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) + tsk->signal->cputime_expires.virt_exp = *newval; + break; + } +out: + posix_cpu_timer_kick_nohz(); +} + +static int do_cpu_nanosleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct itimerspec *it) +{ + struct k_itimer timer; + int error; + + /* + * Set up a temporary timer and then wait for it to go off. + */ + memset(&timer, 0, sizeof timer); + spin_lock_init(&timer.it_lock); + timer.it_clock = which_clock; + timer.it_overrun = -1; + error = posix_cpu_timer_create(&timer); + timer.it_process = current; + if (!error) { + static struct itimerspec zero_it; + + memset(it, 0, sizeof *it); + it->it_value = *rqtp; + + spin_lock_irq(&timer.it_lock); + error = posix_cpu_timer_set(&timer, flags, it, NULL); + if (error) { + spin_unlock_irq(&timer.it_lock); + return error; + } + + while (!signal_pending(current)) { + if (timer.it.cpu.expires == 0) { + /* + * Our timer fired and was reset, below + * deletion can not fail. + */ + posix_cpu_timer_del(&timer); + spin_unlock_irq(&timer.it_lock); + return 0; + } + + /* + * Block until cpu_timer_fire (or a signal) wakes us. + */ + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&timer.it_lock); + schedule(); + spin_lock_irq(&timer.it_lock); + } + + /* + * We were interrupted by a signal. + */ + sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); + error = posix_cpu_timer_set(&timer, 0, &zero_it, it); + if (!error) { + /* + * Timer is now unarmed, deletion can not fail. + */ + posix_cpu_timer_del(&timer); + } + spin_unlock_irq(&timer.it_lock); + + while (error == TIMER_RETRY) { + /* + * We need to handle case when timer was or is in the + * middle of firing. In other cases we already freed + * resources. 
+ */ + spin_lock_irq(&timer.it_lock); + error = posix_cpu_timer_del(&timer); + spin_unlock_irq(&timer.it_lock); + } + + if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { + /* + * It actually did fire already. + */ + return 0; + } + + error = -ERESTART_RESTARTBLOCK; + } + + return error; +} + +static long posix_cpu_nsleep_restart(struct restart_block *restart_block); + +static int posix_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct timespec __user *rmtp) +{ + struct restart_block *restart_block = + &current_thread_info()->restart_block; + struct itimerspec it; + int error; + + /* + * Diagnose required errors first. + */ + if (CPUCLOCK_PERTHREAD(which_clock) && + (CPUCLOCK_PID(which_clock) == 0 || + CPUCLOCK_PID(which_clock) == current->pid)) + return -EINVAL; + + error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); + + if (error == -ERESTART_RESTARTBLOCK) { + + if (flags & TIMER_ABSTIME) + return -ERESTARTNOHAND; + /* + * Report back to the user the time still remaining. + */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + return -EFAULT; + + restart_block->fn = posix_cpu_nsleep_restart; + restart_block->nanosleep.clockid = which_clock; + restart_block->nanosleep.rmtp = rmtp; + restart_block->nanosleep.expires = timespec_to_ns(rqtp); + } + return error; +} + +static long posix_cpu_nsleep_restart(struct restart_block *restart_block) +{ + clockid_t which_clock = restart_block->nanosleep.clockid; + struct timespec t; + struct itimerspec it; + int error; + + t = ns_to_timespec(restart_block->nanosleep.expires); + + error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); + + if (error == -ERESTART_RESTARTBLOCK) { + struct timespec __user *rmtp = restart_block->nanosleep.rmtp; + /* + * Report back to the user the time still remaining.
+ */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + return -EFAULT; + + restart_block->nanosleep.expires = timespec_to_ns(&t); + } + return error; + +} + +#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) +#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) + +static int process_cpu_clock_getres(const clockid_t which_clock, + struct timespec *tp) +{ + return posix_cpu_clock_getres(PROCESS_CLOCK, tp); +} +static int process_cpu_clock_get(const clockid_t which_clock, + struct timespec *tp) +{ + return posix_cpu_clock_get(PROCESS_CLOCK, tp); +} +static int process_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = PROCESS_CLOCK; + return posix_cpu_timer_create(timer); +} +static int process_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, + struct timespec __user *rmtp) +{ + return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); +} +static long process_cpu_nsleep_restart(struct restart_block *restart_block) +{ + return -EINVAL; +} +static int thread_cpu_clock_getres(const clockid_t which_clock, + struct timespec *tp) +{ + return posix_cpu_clock_getres(THREAD_CLOCK, tp); +} +static int thread_cpu_clock_get(const clockid_t which_clock, + struct timespec *tp) +{ + return posix_cpu_clock_get(THREAD_CLOCK, tp); +} +static int thread_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = THREAD_CLOCK; + return posix_cpu_timer_create(timer); +} + +struct k_clock clock_posix_cpu = { + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .nsleep_restart = posix_cpu_nsleep_restart, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, +}; + +static __init int init_posix_cpu_timers(void) +{ + struct k_clock process = { + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + 
.timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, + }; + struct k_clock thread = { + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, + }; + struct timespec ts; + + posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); + posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + + cputime_to_timespec(cputime_one_jiffy, &ts); + onecputick = ts.tv_nsec; + WARN_ON(ts.tv_sec != 0); + + return 0; +} +__initcall(init_posix_cpu_timers); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c new file mode 100644 index 0000000..424c2d4 --- /dev/null +++ b/kernel/time/posix-timers.c @@ -0,0 +1,1121 @@ +/* + * linux/kernel/posix-timers.c + * + * + * 2002-10-15 Posix Clocks & timers + * by George Anzinger george@mvista.com + * + * Copyright (C) 2002 2003 by MontaVista Software. + * + * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. + * Copyright (C) 2004 Boris Hu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA + */ + +/* These are all the functions necessary to implement + * POSIX clocks & timers + */ +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/mutex.h> + +#include <asm/uaccess.h> +#include <linux/list.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/hash.h> +#include <linux/posix-clock.h> +#include <linux/posix-timers.h> +#include <linux/syscalls.h> +#include <linux/wait.h> +#include <linux/workqueue.h> +#include <linux/export.h> +#include <linux/hashtable.h> + +/* + * Management arrays for POSIX timers. Timers are now kept in static hash table + * with 512 entries. + * Timer ids are allocated by local routine, which selects proper hash head by + * key, constructed from current->signal address and per signal struct counter. + * This keeps timer ids unique per process, but now they can intersect between + * processes. + */ + +/* + * Lets keep our timers in a slab cache :-) + */ +static struct kmem_cache *posix_timers_cache; + +static DEFINE_HASHTABLE(posix_timers_hashtable, 9); +static DEFINE_SPINLOCK(hash_lock); + +/* + * we assume that the new SIGEV_THREAD_ID shares no bits with the other + * SIGEV values. Here we put out an error if this assumption fails. + */ +#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ + ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) +#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" +#endif + +/* + * parisc wants ENOTSUP instead of EOPNOTSUPP + */ +#ifndef ENOTSUP +# define ENANOSLEEP_NOTSUP EOPNOTSUPP +#else +# define ENANOSLEEP_NOTSUP ENOTSUP +#endif + +/* + * The timer ID is turned into a timer address by idr_find(). + * Verifying a valid ID consists of: + * + * a) checking that idr_find() returns other than -1. + * b) checking that the timer id matches the one in the timer itself. + * c) that the timer owner is in the callers thread group. + */ + +/* + * CLOCKs: The POSIX standard calls for a couple of clocks and allows us + * to implement others. This structure defines the various + * clocks.
+ * + * RESOLUTION: Clock resolution is used to round up timer and interval + * times, NOT to report clock times, which are reported with as + * much resolution as the system can muster. In some cases this + * resolution may depend on the underlying clock hardware and + * may not be quantifiable until run time, and only then is the + * necessary code is written. The standard says we should say + * something about this issue in the documentation... + * + * FUNCTIONS: The CLOCKs structure defines possible functions to + * handle various clock functions. + * + * The standard POSIX timer management code assumes the + * following: 1.) The k_itimer struct (sched.h) is used for + * the timer. 2.) The list, it_lock, it_clock, it_id and + * it_pid fields are not modified by timer code. + * + * Permissions: It is assumed that the clock_settime() function defined + * for each clock will take care of permission checks. Some + * clocks may be set able by any user (i.e. local process + * clocks) others not. Currently the only set able clock we + * have is CLOCK_REALTIME and its high res counter part, both of + * which we beg off on and pass to do_sys_settimeofday(). + */ + +static struct k_clock posix_clocks[MAX_CLOCKS]; + +/* + * These ones are defined below. 
+ */ +static int common_nsleep(const clockid_t, int flags, struct timespec *t, + struct timespec __user *rmtp); +static int common_timer_create(struct k_itimer *new_timer); +static void common_timer_get(struct k_itimer *, struct itimerspec *); +static int common_timer_set(struct k_itimer *, int, + struct itimerspec *, struct itimerspec *); +static int common_timer_del(struct k_itimer *timer); + +static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); + +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); + +#define lock_timer(tid, flags) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ + __timr; \ +}) + +static int hash(struct signal_struct *sig, unsigned int nr) +{ + return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); +} + +static struct k_itimer *__posix_timers_find(struct hlist_head *head, + struct signal_struct *sig, + timer_t id) +{ + struct k_itimer *timer; + + hlist_for_each_entry_rcu(timer, head, t_hash) { + if ((timer->it_signal == sig) && (timer->it_id == id)) + return timer; + } + return NULL; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + + return __posix_timers_find(head, sig, id); +} + +static int posix_timer_add(struct k_itimer *timer) +{ + struct signal_struct *sig = current->signal; + int first_free_id = sig->posix_timer_id; + struct hlist_head *head; + int ret = -ENOENT; + + do { + spin_lock(&hash_lock); + head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; + if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { + hlist_add_head_rcu(&timer->t_hash, head); + ret = sig->posix_timer_id; + } + if (++sig->posix_timer_id < 0) + sig->posix_timer_id = 0; + if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) + /* Loop over all possible ids completed */ + ret = -EAGAIN; + 
spin_unlock(&hash_lock); + } while (ret == -ENOENT); + return ret; +} + +static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +{ + spin_unlock_irqrestore(&timr->it_lock, flags); +} + +/* Get clock_realtime */ +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) +{ + ktime_get_real_ts(tp); + return 0; +} + +/* Set clock_realtime */ +static int posix_clock_realtime_set(const clockid_t which_clock, + const struct timespec *tp) +{ + return do_sys_settimeofday(tp, NULL); +} + +static int posix_clock_realtime_adj(const clockid_t which_clock, + struct timex *t) +{ + return do_adjtimex(t); +} + +/* + * Get monotonic time for posix timers + */ +static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) +{ + ktime_get_ts(tp); + return 0; +} + +/* + * Get monotonic-raw time for posix timers + */ +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +{ + getrawmonotonic(tp); + return 0; +} + + +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +{ + *tp = current_kernel_time(); + return 0; +} + +static int posix_get_monotonic_coarse(clockid_t which_clock, + struct timespec *tp) +{ + *tp = get_monotonic_coarse(); + return 0; +} + +static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +{ + *tp = ktime_to_timespec(KTIME_LOW_RES); + return 0; +} + +static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) +{ + get_monotonic_boottime(tp); + return 0; +} + +static int posix_get_tai(clockid_t which_clock, struct timespec *tp) +{ + timekeeping_clocktai(tp); + return 0; +} + +/* + * Initialize everything, well, just everything in Posix clocks/timers ;) + */ +static __init int init_posix_timers(void) +{ + struct k_clock clock_realtime = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = 
common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; + struct k_clock clock_monotonic = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; + struct k_clock clock_monotonic_raw = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, + }; + struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + }; + struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + }; + struct k_clock clock_tai = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; + struct k_clock clock_boottime = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; + + posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); + posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); + posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); + 
posix_timers_register_clock(CLOCK_TAI, &clock_tai); + + posix_timers_cache = kmem_cache_create("posix_timers_cache", + sizeof (struct k_itimer), 0, SLAB_PANIC, + NULL); + return 0; +} + +__initcall(init_posix_timers); + +static void schedule_next_timer(struct k_itimer *timr) +{ + struct hrtimer *timer = &timr->it.real.timer; + + if (timr->it.real.interval.tv64 == 0) + return; + + timr->it_overrun += (unsigned int) hrtimer_forward(timer, + timer->base->get_time(), + timr->it.real.interval); + + timr->it_overrun_last = timr->it_overrun; + timr->it_overrun = -1; + ++timr->it_requeue_pending; + hrtimer_restart(timer); +} + +/* + * This function is exported for use by the signal deliver code. It is + * called just prior to the info block being released and passes that + * block to us. It's function is to update the overrun entry AND to + * restart the timer. It should only be called if the timer is to be + * restarted (i.e. we have flagged this in the sys_private entry of the + * info block). + * + * To protect against the timer going away while the interrupt is queued, + * we require that the it_requeue_pending flag be set. + */ +void do_schedule_next_timer(struct siginfo *info) +{ + struct k_itimer *timr; + unsigned long flags; + + timr = lock_timer(info->si_tid, &flags); + + if (timr && timr->it_requeue_pending == info->si_sys_private) { + if (timr->it_clock < 0) + posix_cpu_timer_schedule(timr); + else + schedule_next_timer(timr); + + info->si_overrun += timr->it_overrun_last; + } + + if (timr) + unlock_timer(timr, flags); +} + +int posix_timer_event(struct k_itimer *timr, int si_private) +{ + struct task_struct *task; + int shared, ret = -1; + /* + * FIXME: if ->sigq is queued we can race with + * dequeue_signal()->do_schedule_next_timer(). + * + * If dequeue_signal() sees the "right" value of + * si_sys_private it calls do_schedule_next_timer(). + * We re-queue ->sigq and drop ->it_lock(). 
+ * do_schedule_next_timer() locks the timer + * and re-schedules it while ->sigq is pending. + * Not really bad, but not that we want. + */ + timr->sigq->info.si_sys_private = si_private; + + rcu_read_lock(); + task = pid_task(timr->it_pid, PIDTYPE_PID); + if (task) { + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); + ret = send_sigqueue(timr->sigq, task, shared); + } + rcu_read_unlock(); + /* If we failed to send the signal the timer stops. */ + return ret > 0; +} +EXPORT_SYMBOL_GPL(posix_timer_event); + +/* + * This function gets called when a POSIX.1b interval timer expires. It + * is used as a callback from the kernel internal timer. The + * run_timer_list code ALWAYS calls with interrupts on. + + * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. + */ +static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) +{ + struct k_itimer *timr; + unsigned long flags; + int si_private = 0; + enum hrtimer_restart ret = HRTIMER_NORESTART; + + timr = container_of(timer, struct k_itimer, it.real.timer); + spin_lock_irqsave(&timr->it_lock, flags); + + if (timr->it.real.interval.tv64 != 0) + si_private = ++timr->it_requeue_pending; + + if (posix_timer_event(timr, si_private)) { + /* + * signal was not sent because of sig_ignor + * we will not get a call back to restart it AND + * it should be restarted. + */ + if (timr->it.real.interval.tv64 != 0) { + ktime_t now = hrtimer_cb_get_time(timer); + + /* + * FIXME: What we really want, is to stop this + * timer completely and restart it in case the + * SIG_IGN is removed. This is a non trivial + * change which involves sighand locking + * (sigh !), which we don't want to do late in + * the release cycle. + * + * For now we just let timers with an interval + * less than a jiffie expire every jiffie to + * avoid softirq starvation in case of SIG_IGN + * and a very small interval, which would put + * the timer right back on the softirq pending + * list. 
By moving now ahead of time we trick + * hrtimer_forward() to expire the timer + * later, while we still maintain the overrun + * accuracy, but have some inconsistency in + * the timer_gettime() case. This is at least + * better than a starved softirq. A more + * complex fix which solves also another related + * inconsistency is already in the pipeline. + */ +#ifdef CONFIG_HIGH_RES_TIMERS + { + ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ); + + if (timr->it.real.interval.tv64 < kj.tv64) + now = ktime_add(now, kj); + } +#endif + timr->it_overrun += (unsigned int) + hrtimer_forward(timer, now, + timr->it.real.interval); + ret = HRTIMER_RESTART; + ++timr->it_requeue_pending; + } + } + + unlock_timer(timr, flags); + return ret; +} + +static struct pid *good_sigevent(sigevent_t * event) +{ + struct task_struct *rtn = current->group_leader; + + if ((event->sigev_notify & SIGEV_THREAD_ID ) && + (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || + !same_thread_group(rtn, current) || + (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + return NULL; + + if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && + ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) + return NULL; + + return task_pid(rtn); +} + +void posix_timers_register_clock(const clockid_t clock_id, + struct k_clock *new_clock) +{ + if ((unsigned) clock_id >= MAX_CLOCKS) { + printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", + clock_id); + return; + } + + if (!new_clock->clock_get) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", + clock_id); + return; + } + if (!new_clock->clock_getres) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", + clock_id); + return; + } + + posix_clocks[clock_id] = *new_clock; +} +EXPORT_SYMBOL_GPL(posix_timers_register_clock); + +static struct k_itimer * alloc_posix_timer(void) +{ + struct k_itimer *tmr; + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + if (!tmr) + return tmr; + 
if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { + kmem_cache_free(posix_timers_cache, tmr); + return NULL; + } + memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); + return tmr; +} + +static void k_itimer_rcu_free(struct rcu_head *head) +{ + struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); + + kmem_cache_free(posix_timers_cache, tmr); +} + +#define IT_ID_SET 1 +#define IT_ID_NOT_SET 0 +static void release_posix_timer(struct k_itimer *tmr, int it_id_set) +{ + if (it_id_set) { + unsigned long flags; + spin_lock_irqsave(&hash_lock, flags); + hlist_del_rcu(&tmr->t_hash); + spin_unlock_irqrestore(&hash_lock, flags); + } + put_pid(tmr->it_pid); + sigqueue_free(tmr->sigq); + call_rcu(&tmr->it.rcu, k_itimer_rcu_free); +} + +static struct k_clock *clockid_to_kclock(const clockid_t id) +{ + if (id < 0) + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; + + if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) + return NULL; + return &posix_clocks[id]; +} + +static int common_timer_create(struct k_itimer *new_timer) +{ + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + return 0; +} + +/* Create a POSIX.1b interval timer. 
*/ + +SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, + struct sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct k_itimer *new_timer; + int error, new_timer_id; + sigevent_t event; + int it_id_set = IT_ID_NOT_SET; + + if (!kc) + return -EINVAL; + if (!kc->timer_create) + return -EOPNOTSUPP; + + new_timer = alloc_posix_timer(); + if (unlikely(!new_timer)) + return -EAGAIN; + + spin_lock_init(&new_timer->it_lock); + new_timer_id = posix_timer_add(new_timer); + if (new_timer_id < 0) { + error = new_timer_id; + goto out; + } + + it_id_set = IT_ID_SET; + new_timer->it_id = (timer_t) new_timer_id; + new_timer->it_clock = which_clock; + new_timer->it_overrun = -1; + + if (timer_event_spec) { + if (copy_from_user(&event, timer_event_spec, sizeof (event))) { + error = -EFAULT; + goto out; + } + rcu_read_lock(); + new_timer->it_pid = get_pid(good_sigevent(&event)); + rcu_read_unlock(); + if (!new_timer->it_pid) { + error = -EINVAL; + goto out; + } + } else { + event.sigev_notify = SIGEV_SIGNAL; + event.sigev_signo = SIGALRM; + event.sigev_value.sival_int = new_timer->it_id; + new_timer->it_pid = get_pid(task_tgid(current)); + } + + new_timer->it_sigev_notify = event.sigev_notify; + new_timer->sigq->info.si_signo = event.sigev_signo; + new_timer->sigq->info.si_value = event.sigev_value; + new_timer->sigq->info.si_tid = new_timer->it_id; + new_timer->sigq->info.si_code = SI_TIMER; + + if (copy_to_user(created_timer_id, + &new_timer_id, sizeof (new_timer_id))) { + error = -EFAULT; + goto out; + } + + error = kc->timer_create(new_timer); + if (error) + goto out; + + spin_lock_irq(¤t->sighand->siglock); + new_timer->it_signal = current->signal; + list_add(&new_timer->list, ¤t->signal->posix_timers); + spin_unlock_irq(¤t->sighand->siglock); + + return 0; + /* + * In the case of the timer belonging to another task, after + * the task is unlocked, the timer is owned by the other 
task + * and may cease to exist at any time. Don't use or modify + * new_timer after the unlock call. + */ +out: + release_posix_timer(new_timer, it_id_set); + return error; +} + +/* + * Locking issues: We need to protect the result of the id look up until + * we get the timer locked down so it is not deleted under us. The + * removal is done under the idr spinlock so we use that here to bridge + * the find to the timer lock. To avoid a dead lock, the timer id MUST + * be release with out holding the timer lock. + */ +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) +{ + struct k_itimer *timr; + + /* + * timer_t could be any type >= int and we want to make sure any + * @timer_id outside positive int range fails lookup. + */ + if ((unsigned long long)timer_id > INT_MAX) + return NULL; + + rcu_read_lock(); + timr = posix_timer_by_id(timer_id); + if (timr) { + spin_lock_irqsave(&timr->it_lock, *flags); + if (timr->it_signal == current->signal) { + rcu_read_unlock(); + return timr; + } + spin_unlock_irqrestore(&timr->it_lock, *flags); + } + rcu_read_unlock(); + + return NULL; +} + +/* + * Get the time remaining on a POSIX.1b interval timer. This function + * is ALWAYS called with spin_lock_irq on the timer, thus it must not + * mess with irq. + * + * We have a couple of messes to clean up here. First there is the case + * of a timer that has a requeue pending. These timers should appear to + * be in the timer list with an expiry as if we were to requeue them + * now. + * + * The second issue is the SIGEV_NONE timer which may be active but is + * not really ever put in the timer list (to save system resources). + * This timer may be expired, and if so, we will do it here. Otherwise + * it is the same as a requeue pending timer WRT to what we should + * report. 
+ */ +static void +common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) +{ + ktime_t now, remaining, iv; + struct hrtimer *timer = &timr->it.real.timer; + + memset(cur_setting, 0, sizeof(struct itimerspec)); + + iv = timr->it.real.interval; + + /* interval timer ? */ + if (iv.tv64) + cur_setting->it_interval = ktime_to_timespec(iv); + else if (!hrtimer_active(timer) && + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) + return; + + now = timer->base->get_time(); + + /* + * When a requeue is pending or this is a SIGEV_NONE + * timer move the expiry time forward by intervals, so + * expiry is > now. + */ + if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) + timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); + + remaining = ktime_sub(hrtimer_get_expires(timer), now); + /* Return 0 only, when the timer is expired and not pending */ + if (remaining.tv64 <= 0) { + /* + * A single shot SIGEV_NONE timer must return 0, when + * it is expired ! + */ + if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) + cur_setting->it_value.tv_nsec = 1; + } else + cur_setting->it_value = ktime_to_timespec(remaining); +} + +/* Get the time remaining on a POSIX.1b interval timer. */ +SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct itimerspec __user *, setting) +{ + struct itimerspec cur_setting; + struct k_itimer *timr; + struct k_clock *kc; + unsigned long flags; + int ret = 0; + + timr = lock_timer(timer_id, &flags); + if (!timr) + return -EINVAL; + + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_get)) + ret = -EINVAL; + else + kc->timer_get(timr, &cur_setting); + + unlock_timer(timr, flags); + + if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + return -EFAULT; + + return ret; +} + +/* + * Get the number of overruns of a POSIX.1b interval timer. 
This is to + * be the overrun of the timer last delivered. At the same time we are + * accumulating overruns on the next timer. The overrun is frozen when + * the signal is delivered, either at the notify time (if the info block + * is not queued) or at the actual delivery time (as we are informed by + * the call back to do_schedule_next_timer(). So all we need to do is + * to pick up the frozen overrun. + */ +SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) +{ + struct k_itimer *timr; + int overrun; + unsigned long flags; + + timr = lock_timer(timer_id, &flags); + if (!timr) + return -EINVAL; + + overrun = timr->it_overrun_last; + unlock_timer(timr, flags); + + return overrun; +} + +/* Set a POSIX.1b interval timer. */ +/* timr->it_lock is taken. */ +static int +common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec *new_setting, struct itimerspec *old_setting) +{ + struct hrtimer *timer = &timr->it.real.timer; + enum hrtimer_mode mode; + + if (old_setting) + common_timer_get(timr, old_setting); + + /* disable the timer */ + timr->it.real.interval.tv64 = 0; + /* + * careful here. If smp we could be in the "fire" routine which will + * be spinning as we hold the lock. But this is ONLY an SMP issue. + */ + if (hrtimer_try_to_cancel(timer) < 0) + return TIMER_RETRY; + + timr->it_requeue_pending = (timr->it_requeue_pending + 2) & + ~REQUEUE_PENDING; + timr->it_overrun_last = 0; + + /* switch off the timer when it_value is zero */ + if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) + return 0; + + mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; + hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); + timr->it.real.timer.function = posix_timer_fn; + + hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); + + /* Convert interval */ + timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); + + /* SIGEV_NONE timers are not queued ! 
See common_timer_get */ + if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { + /* Setup correct expiry time for relative timers */ + if (mode == HRTIMER_MODE_REL) { + hrtimer_add_expires(timer, timer->base->get_time()); + } + return 0; + } + + hrtimer_start_expires(timer, mode); + return 0; +} + +/* Set a POSIX.1b interval timer */ +SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + const struct itimerspec __user *, new_setting, + struct itimerspec __user *, old_setting) +{ + struct k_itimer *timr; + struct itimerspec new_spec, old_spec; + int error = 0; + unsigned long flag; + struct itimerspec *rtn = old_setting ? &old_spec : NULL; + struct k_clock *kc; + + if (!new_setting) + return -EINVAL; + + if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) + return -EFAULT; + + if (!timespec_valid(&new_spec.it_interval) || + !timespec_valid(&new_spec.it_value)) + return -EINVAL; +retry: + timr = lock_timer(timer_id, &flag); + if (!timr) + return -EINVAL; + + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_set)) + error = -EINVAL; + else + error = kc->timer_set(timr, flags, &new_spec, rtn); + + unlock_timer(timr, flag); + if (error == TIMER_RETRY) { + rtn = NULL; // We already got the old time... + goto retry; + } + + if (old_setting && !error && + copy_to_user(old_setting, &old_spec, sizeof (old_spec))) + error = -EFAULT; + + return error; +} + +static int common_timer_del(struct k_itimer *timer) +{ + timer->it.real.interval.tv64 = 0; + + if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) + return TIMER_RETRY; + return 0; +} + +static inline int timer_delete_hook(struct k_itimer *timer) +{ + struct k_clock *kc = clockid_to_kclock(timer->it_clock); + + if (WARN_ON_ONCE(!kc || !kc->timer_del)) + return -EINVAL; + return kc->timer_del(timer); +} + +/* Delete a POSIX.1b interval timer. 
*/ +SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) +{ + struct k_itimer *timer; + unsigned long flags; + +retry_delete: + timer = lock_timer(timer_id, &flags); + if (!timer) + return -EINVAL; + + if (timer_delete_hook(timer) == TIMER_RETRY) { + unlock_timer(timer, flags); + goto retry_delete; + } + + spin_lock(¤t->sighand->siglock); + list_del(&timer->list); + spin_unlock(¤t->sighand->siglock); + /* + * This keeps any tasks waiting on the spin lock from thinking + * they got something (see the lock code above). + */ + timer->it_signal = NULL; + + unlock_timer(timer, flags); + release_posix_timer(timer, IT_ID_SET); + return 0; +} + +/* + * return timer owned by the process, used by exit_itimers + */ +static void itimer_delete(struct k_itimer *timer) +{ + unsigned long flags; + +retry_delete: + spin_lock_irqsave(&timer->it_lock, flags); + + if (timer_delete_hook(timer) == TIMER_RETRY) { + unlock_timer(timer, flags); + goto retry_delete; + } + list_del(&timer->list); + /* + * This keeps any tasks waiting on the spin lock from thinking + * they got something (see the lock code above). + */ + timer->it_signal = NULL; + + unlock_timer(timer, flags); + release_posix_timer(timer, IT_ID_SET); +} + +/* + * This is called by do_exit or de_thread, only when there are no more + * references to the shared signal_struct. 
+ */ +void exit_itimers(struct signal_struct *sig) +{ + struct k_itimer *tmr; + + while (!list_empty(&sig->posix_timers)) { + tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); + itimer_delete(tmr); + } +} + +SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + const struct timespec __user *, tp) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec new_tp; + + if (!kc || !kc->clock_set) + return -EINVAL; + + if (copy_from_user(&new_tp, tp, sizeof (*tp))) + return -EFAULT; + + return kc->clock_set(which_clock, &new_tp); +} + +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct timespec __user *,tp) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec kernel_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_get(which_clock, &kernel_tp); + + if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + error = -EFAULT; + + return error; +} + +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct timex __user *, utx) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + if (copy_from_user(&ktx, utx, sizeof(ktx))) + return -EFAULT; + + err = kc->clock_adj(which_clock, &ktx); + + if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) + return -EFAULT; + + return err; +} + +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, + struct timespec __user *, tp) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec rtn_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_getres(which_clock, &rtn_tp); + + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) + error = -EFAULT; + + return error; +} + +/* + * nanosleep for monotonic and realtime clocks + */ +static int common_nsleep(const clockid_t which_clock, int flags, + struct timespec *tsave, struct timespec __user *rmtp) +{ + return 
hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} + +SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, + const struct timespec __user *, rqtp, + struct timespec __user *, rmtp) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec t; + + if (!kc) + return -EINVAL; + if (!kc->nsleep) + return -ENANOSLEEP_NOTSUP; + + if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + return -EFAULT; + + if (!timespec_valid(&t)) + return -EINVAL; + + return kc->nsleep(which_clock, flags, &t, rmtp); +} + +/* + * This will restart clock_nanosleep. This is required only by + * compat_clock_nanosleep_restart for now. + */ +long clock_nanosleep_restart(struct restart_block *restart_block) +{ + clockid_t which_clock = restart_block->nanosleep.clockid; + struct k_clock *kc = clockid_to_kclock(which_clock); + + if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) + return -EINVAL; + + return kc->nsleep_restart(restart_block); +} diff --git a/kernel/time/time.c b/kernel/time/time.c new file mode 100644 index 0000000..7c7964c --- /dev/null +++ b/kernel/time/time.c @@ -0,0 +1,714 @@ +/* + * linux/kernel/time.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file contains the interface functions for the various + * time related system calls: time, stime, gettimeofday, settimeofday, + * adjtime + */ +/* + * Modification history kernel/time.c + * + * 1993-09-02 Philip Gladstone + * Created file with time related functions from sched/core.c and adjtimex() + * 1993-10-08 Torsten Duwe + * adjtime interface update and CMOS clock write code + * 1995-08-13 Torsten Duwe + * kernel PLL updated to 1994-12-13 specs (rfc-1589) + * 1999-01-16 Ulrich Windl + * Introduced error checking for many cases in adjtimex(). 
+ * Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) + * (Even though the technical memorandum forbids it) + * 2004-07-14 Christoph Lameter + * Added getnstimeofday to allow the posix timer functions to return + * with nanosecond accuracy + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "timeconst.h" + +/* + * The timezone where the local system is located. Used as a default by some + * programs who obtain this value by using gettimeofday. + */ +struct timezone sys_tz; + +EXPORT_SYMBOL(sys_tz); + +#ifdef __ARCH_WANT_SYS_TIME + +/* + * sys_time() can be implemented in user-level using + * sys_gettimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ +SYSCALL_DEFINE1(time, time_t __user *, tloc) +{ + time_t i = get_seconds(); + + if (tloc) { + if (put_user(i,tloc)) + return -EFAULT; + } + force_successful_syscall_return(); + return i; +} + +/* + * sys_stime() can be implemented in user-level using + * sys_settimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). 
+ */ + +SYSCALL_DEFINE1(stime, time_t __user *, tptr) +{ + struct timespec tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime(&tv, NULL); + if (err) + return err; + + do_settimeofday(&tv); + return 0; +} + +#endif /* __ARCH_WANT_SYS_TIME */ + +SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) +{ + if (likely(tv != NULL)) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (copy_to_user(tv, &ktv, sizeof(ktv))) + return -EFAULT; + } + if (unlikely(tz != NULL)) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + return 0; +} + +/* + * Indicates if there is an offset between the system clock and the hardware + * clock/persistent clock/rtc. + */ +int persistent_clock_is_local; + +/* + * Adjust the time obtained from the CMOS to be UTC time instead of + * local time. + * + * This is ugly, but preferable to the alternatives. Otherwise we + * would either need to write a program to do it in /etc/rc (and risk + * confusion if the program gets run more than once; it would also be + * hard to make the program warp the clock precisely n hours) or + * compile in the timezone information into the kernel. Bad, bad.... + * + * - TYT, 1992-01-01 + * + * The best thing to do is to keep the CMOS clock in universal time (UTC) + * as real UNIX machines always do it. This avoids all headaches about + * daylight saving times and warping kernel clocks. + */ +static inline void warp_clock(void) +{ + if (sys_tz.tz_minuteswest != 0) { + struct timespec adjust; + + persistent_clock_is_local = 1; + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); + } +} + +/* + * In case for some reason the CMOS clock has not already been running + * in UTC, but in some local time: The first time we set the timezone, + * we will warp the clock so that it is ticking UTC time instead of + * local time. 
Presumably, if someone is setting the timezone then we + * are running in an environment where the programs understand about + * timezones. This should be done at boot time in the /etc/rc script, + * as soon as possible, so that the clock can be set right. Otherwise, + * various programs will get confused when the clock gets warped. + */ + +int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) +{ + static int firsttime = 1; + int error = 0; + + if (tv && !timespec_valid(tv)) + return -EINVAL; + + error = security_settime(tv, tz); + if (error) + return error; + + if (tz) { + sys_tz = *tz; + update_vsyscall_tz(); + if (firsttime) { + firsttime = 0; + if (!tv) + warp_clock(); + } + } + if (tv) + return do_settimeofday(tv); + return 0; +} + +SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) +{ + struct timeval user_tv; + struct timespec new_ts; + struct timezone new_tz; + + if (tv) { + if (copy_from_user(&user_tv, tv, sizeof(*tv))) + return -EFAULT; + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); +} + +SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) +{ + struct timex txc; /* Local copy of parameter */ + int ret; + + /* Copy the user data space into the kernel copy + * structure. But bear in mind that the structures + * may change + */ + if(copy_from_user(&txc, txc_p, sizeof(struct timex))) + return -EFAULT; + ret = do_adjtimex(&txc); + return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; +} + +/** + * current_fs_time - Return FS time + * @sb: Superblock. + * + * Return the current time truncated to the time granularity supported by + * the fs. 
+ */ +struct timespec current_fs_time(struct super_block *sb) +{ + struct timespec now = current_kernel_time(); + return timespec_trunc(now, sb->s_time_gran); +} +EXPORT_SYMBOL(current_fs_time); + +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; +# else + return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; +# else + return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + +/** + * timespec_trunc - Truncate timespec to a granularity + * @t: Timespec + * @gran: Granularity in ns. + * + * Truncate a timespec to a granularity. gran must be smaller than a second. + * Always rounds down. + * + * This function should be only used for timestamps returned by + * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because + * it doesn't handle the better resolution of the latter. + */ +struct timespec timespec_trunc(struct timespec t, unsigned gran) +{ + /* + * Division is pretty slow so avoid it for common cases. + * Currently current_kernel_time() never returns better than + * jiffies resolution. Exploit that. 
+ */ + if (gran <= jiffies_to_usecs(1) * 1000) { + /* nothing */ + } else if (gran == 1000000000) { + t.tv_nsec = 0; + } else { + t.tv_nsec -= t.tv_nsec % gran; + } + return t; +} +EXPORT_SYMBOL(timespec_trunc); + +/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. + * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 + * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. + * + * [For the Julian calendar (which was used in Russia before 1917, + * Britain & colonies before 1752, anywhere else before 1582, + * and is still in use by some communities) leave out the + * -year/100+year/400 terms, and add 10.] + * + * This algorithm was first published by Gauss (I think). + * + * WARNING: this function will overflow on 2106-02-07 06:28:16 on + * machines where long is 32-bit! (However, as time_t is signed, we + * will already get problems at other places on 2038-01-19 03:14:08) + */ +unsigned long +mktime(const unsigned int year0, const unsigned int mon0, + const unsigned int day, const unsigned int hour, + const unsigned int min, const unsigned int sec) +{ + unsigned int mon = mon0, year = year0; + + /* 1..12 -> 11,12,1..10 */ + if (0 >= (int) (mon -= 2)) { + mon += 12; /* Puts Feb last since it has leap day */ + year -= 1; + } + + return ((((unsigned long) + (year/4 - year/100 + year/400 + 367*mon/12 + day) + + year*365 - 719499 + )*24 + hour /* now have hours */ + )*60 + min /* now have minutes */ + )*60 + sec; /* finally seconds */ +} + +EXPORT_SYMBOL(mktime); + +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! 
+ */ +void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec) +{ + while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + asm("" : "+rm"(nsec)); + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec); + +/** + * ns_to_timespec - Convert nanoseconds to timespec + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec representation of the nsec parameter. + */ +struct timespec ns_to_timespec(const s64 nsec) +{ + struct timespec ts; + s32 rem; + + if (!nsec) + return (struct timespec) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +EXPORT_SYMBOL(ns_to_timespec); + +/** + * ns_to_timeval - Convert nanoseconds to timeval + * @nsec: the nanoseconds value to be converted + * + * Returns the timeval representation of the nsec parameter. + */ +struct timeval ns_to_timeval(const s64 nsec) +{ + struct timespec ts = ns_to_timespec(nsec); + struct timeval tv; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; + + return tv; +} +EXPORT_SYMBOL(ns_to_timeval); + +/* + * When we convert to jiffies then we interpret incoming values + * the following way: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor + * + * We must also be careful about 32-bit overflows. 
+ */ +unsigned long msecs_to_jiffies(const unsigned int m) +{ + /* + * Negative value, means infinite timeout: + */ + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + /* + * HZ is equal to or smaller than 1000, and 1000 is a nice + * round multiple of HZ, divide with the factor between them, + * but round upwards: + */ + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + /* + * HZ is larger than 1000, and HZ is a nice round multiple of + * 1000 - simply multiply with the factor between them. + * + * But first make sure the multiplication result cannot + * overflow: + */ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return m * (HZ / MSEC_PER_SEC); +#else + /* + * Generic case - multiply, round and divide. But first + * check that if we are doing a net multiplication, that + * we wouldn't overflow: + */ + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) + >> MSEC_TO_HZ_SHR32; +#endif +} +EXPORT_SYMBOL(msecs_to_jiffies); + +/* + * Convert microseconds to jiffies. Inputs larger than + * jiffies_to_usecs(MAX_JIFFY_OFFSET) are clamped to MAX_JIFFY_OFFSET + * before any scaling is done. + */ +unsigned long usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return u * (HZ / USEC_PER_SEC); +#else + return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) + >> USEC_TO_HZ_SHR32; +#endif +} +EXPORT_SYMBOL(usecs_to_jiffies); + +/* + * The TICK_NSEC - 1 rounds up the value to the next resolution. Note + * that a remainder subtract here would not do the right thing as the + * resolution values don't fall on second boundaries. I.e. the line: + * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * + * Rather, we just shift the bits off the right.
+ * + * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec + * value to a scaled second value. + */ +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + unsigned long sec = value->tv_sec; + long nsec = value->tv_nsec + TICK_NSEC - 1; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + nsec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)nsec * NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + +} +EXPORT_SYMBOL(timespec_to_jiffies); + +void +jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u32 rem; + value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_nsec = rem; +} +EXPORT_SYMBOL(jiffies_to_timespec); + +/* Same for "timeval" + * + * Well, almost. The problem here is that the real system resolution is + * in nanoseconds and the value being converted is in micro seconds. + * Also for some machines (those that use HZ = 1024, in-particular), + * there is a LARGE error in the tick size in microseconds. + + * The solution we use is to do the rounding AFTER we convert the + * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. + * Instruction wise, this should cost only an additional add with carry + * instruction above the way it was done above. + */ +unsigned long +timeval_to_jiffies(const struct timeval *value) +{ + unsigned long sec = value->tv_sec; + long usec = value->tv_usec; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + usec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> + (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; +} +EXPORT_SYMBOL(timeval_to_jiffies); + +void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. 
+ */ + u32 rem; + + value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_usec = rem / NSEC_PER_USEC; +} +EXPORT_SYMBOL(jiffies_to_timeval); + +/* + * Convert jiffies/jiffies_64 to clock_t and back. + */ +clock_t jiffies_to_clock_t(unsigned long x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + return x * (USER_HZ / HZ); +# else + return x / (HZ / USER_HZ); +# endif +#else + return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); +#endif +} +EXPORT_SYMBOL(jiffies_to_clock_t); + +unsigned long clock_t_to_jiffies(unsigned long x) +{ +#if (HZ % USER_HZ)==0 + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +#else + /* Don't worry about loss of precision here .. */ + if (x >= ~0UL / HZ * USER_HZ) + return ~0UL; + + /* .. but do try to contain it here */ + return div_u64((u64)x * HZ, USER_HZ); +#endif +} +EXPORT_SYMBOL(clock_t_to_jiffies); + +u64 jiffies_64_to_clock_t(u64 x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + x = div_u64(x * USER_HZ, HZ); +# elif HZ > USER_HZ + x = div_u64(x, HZ / USER_HZ); +# else + /* Nothing to do */ +# endif +#else + /* + * There are better ways that don't overflow early, + * but even this doesn't overflow in hundreds of years + * in 64 bits, so.. + */ + x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); +#endif + return x; +} +EXPORT_SYMBOL(jiffies_64_to_clock_t); + +u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + return div_u64(x, NSEC_PER_SEC / USER_HZ); +#elif (USER_HZ % 512) == 0 + return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... 
+ */ + return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); +#endif +} + +/** + * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +u64 nsecs_to_jiffies64(u64 n) +{ +#if (NSEC_PER_SEC % HZ) == 0 + /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ + return div_u64(n, NSEC_PER_SEC / HZ); +#elif (HZ % 512) == 0 + /* overflow after 292 years if HZ = 1024 */ + return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * Generic case - optimized for cases where HZ is a multiple of 3. + * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. + */ + return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); +#endif +} + +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) +{ + return (unsigned long)nsecs_to_jiffies64(n); +} + +/* + * Add two timespec values and do a safety check for overflow. 
+ * It's assumed that both values are valid (>= 0) + */ +struct timespec timespec_add_safe(const struct timespec lhs, + const struct timespec rhs) +{ + struct timespec res; + + set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) + res.tv_sec = TIME_T_MAX; + + return res; +} diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc new file mode 100644 index 0000000..511bdf2 --- /dev/null +++ b/kernel/time/timeconst.bc @@ -0,0 +1,108 @@ +scale=0 + +define gcd(a,b) { + auto t; + while (b) { + t = b; + b = a % b; + a = t; + } + return a; +} + +/* Division by reciprocal multiplication. */ +define fmul(b,n,d) { + return (2^b*n+d-1)/d; +} + +/* Adjustment factor when a ceiling value is used. Use as: + (imul * n) + (fmulxx * n + fadjxx) >> xx) */ +define fadj(b,n,d) { + auto v; + d = d/gcd(n,d); + v = 2^b*(d-1)/d; + return v; +} + +/* Compute the appropriate mul/adj values as well as a shift count, + which brings the mul value into the range 2^b-1 <= x < 2^b. Such + a shift value will be correct in the signed integer range and off + by at most one in the upper half of the unsigned range. 
*/ +define fmuls(b,n,d) { + auto s, m; + for (s = 0; 1; s++) { + m = fmul(s,n,d); + if (m >= 2^(b-1)) + return s; + } + return 0; +} + +/* Print the text of the timeconst.h header for the given hz value. */ +define timeconst(hz) { + print "/* Automatically generated by kernel/timeconst.bc */\n" + print "/* Time conversion constants for HZ == ", hz, " */\n" + print "\n" + + print "#ifndef KERNEL_TIMECONST_H\n" + print "#define KERNEL_TIMECONST_H\n\n" + + print "#include <asm/param.h>\n" + print "#include <linux/types.h>\n\n" + + print "#if HZ != ", hz, "\n" + print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n" + print "#endif\n\n" + + if (hz < 2) { + print "#error Totally bogus HZ value!\n" + } else { + s=fmuls(32,1000,hz) + obase=16 + print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n" + print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n" + obase=10 + print "#define HZ_TO_MSEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000) + obase=16 + print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n" + print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n" + obase=10 + print "#define MSEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000) + print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n" + print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n" + print "\n" + + s=fmuls(32,1000000,hz) + obase=16 + print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n" + print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n" + obase=10 + print "#define HZ_TO_USEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000000) + obase=16 + print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n" + print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n" + obase=10 + print "#define USEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000000) + print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n" + print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" + print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" +
print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" + print "\n" + + print "#endif /* KERNEL_TIMECONST_H */\n" + } + halt +} + +timeconst(hz) diff --git a/kernel/time/timer.c b/kernel/time/timer.c new file mode 100644 index 0000000..3bb01a3 --- /dev/null +++ b/kernel/time/timer.c @@ -0,0 +1,1734 @@ +/* + * linux/kernel/timer.c + * + * Kernel internal timers + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. + * + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). + * Copyright (C) 1998 Andrea Arcangeli + * 1999-03-10 Improved NTP compatibility by Ulrich Windl + * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love + * 2000-10-05 Implemented scalable SMP per-CPU timer handling. + * Copyright (C) 2000, 2001, 2002 Ingo Molnar + * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar + */ + +#include <linux/kernel_stat.h> +#include <linux/export.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pid_namespace.h> +#include <linux/notifier.h> +#include <linux/thread_info.h> +#include <linux/time.h> +#include <linux/jiffies.h> +#include <linux/posix-timers.h> +#include <linux/cpu.h> +#include <linux/syscalls.h> +#include <linux/delay.h> +#include <linux/tick.h> +#include <linux/kallsyms.h> +#include <linux/irq_work.h> +#include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/slab.h> +#include <linux/compat.h> + +#include <asm/uaccess.h> +#include <asm/unistd.h> +#include <asm/div64.h> +#include <asm/timex.h> +#include <asm/io.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/timer.h> + +__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + +/* + * per-CPU timer vector definitions: + */ +#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) +#define TVR_BITS (CONFIG_BASE_SMALL ?
6 : 8) +#define TVN_SIZE (1 << TVN_BITS) +#define TVR_SIZE (1 << TVR_BITS) +#define TVN_MASK (TVN_SIZE - 1) +#define TVR_MASK (TVR_SIZE - 1) +#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) + +struct tvec { + struct list_head vec[TVN_SIZE]; +}; + +struct tvec_root { + struct list_head vec[TVR_SIZE]; +}; + +struct tvec_base { + spinlock_t lock; + struct timer_list *running_timer; + unsigned long timer_jiffies; + unsigned long next_timer; + unsigned long active_timers; + unsigned long all_timers; + struct tvec_root tv1; + struct tvec tv2; + struct tvec tv3; + struct tvec tv4; + struct tvec tv5; +} ____cacheline_aligned; + +struct tvec_base boot_tvec_bases; +EXPORT_SYMBOL(boot_tvec_bases); +static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; + +/* Functions below help us manage 'deferrable' flag */ +static inline unsigned int tbase_get_deferrable(struct tvec_base *base) +{ + return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); +} + +static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) +{ + return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); +} + +static inline struct tvec_base *tbase_get_base(struct tvec_base *base) +{ + return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); +} + +static inline void +timer_set_base(struct timer_list *timer, struct tvec_base *new_base) +{ + unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; + + timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags); +} + +static unsigned long round_jiffies_common(unsigned long j, int cpu, + bool force_up) +{ + int rem; + unsigned long original = j; + + /* + * We don't want all cpus firing their timers at once hitting the + * same lock or cachelines, so we skew each extra cpu with an extra + * 3 jiffies. This 3 jiffies came originally from the mm/ code which + * already did this. + * The skew is done by adding 3*cpunr, then round, then subtract this + * extra offset again. 
+ */ + j += cpu * 3; + + rem = j % HZ; + + /* + * If the target jiffie is just after a whole second (which can happen + * due to delays of the timer irq, long irq off times etc etc) then + * we should round down to the whole second, not up. Use 1/4th second + * as cutoff for this rounding as an extreme upper bound for this. + * But never round down if @force_up is set. + */ + if (rem < HZ/4 && !force_up) /* round down */ + j = j - rem; + else /* round up */ + j = j - rem + HZ; + + /* now that we have rounded, subtract the extra skew again */ + j -= cpu * 3; + + /* + * Make sure j is still in the future. Otherwise return the + * unmodified value. + */ + return time_is_after_jiffies(j) ? j : original; +} + +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. 
+ */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, false); +} +EXPORT_SYMBOL_GPL(__round_jiffies); + +/** + * __round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, false) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_relative); + +/** + * round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. 
+ * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), false); +} +EXPORT_SYMBOL_GPL(round_jiffies); + +/** + * round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies_relative(unsigned long j) +{ + return __round_jiffies_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_relative); + +/** + * __round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, true); +} +EXPORT_SYMBOL_GPL(__round_jiffies_up); + +/** + * __round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies_relative() except that it will never + * round down. 
This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, true) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); + +/** + * round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * This is the same as round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), true); +} +EXPORT_SYMBOL_GPL(round_jiffies_up); + +/** + * round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * This is the same as round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up_relative(unsigned long j) +{ + return __round_jiffies_up_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_up_relative); + +/** + * set_timer_slack - set the allowed slack for a timer + * @timer: the timer to be modified + * @slack_hz: the amount of time (in jiffies) allowed for rounding + * + * Set the amount of time, in jiffies, that a certain timer has + * in terms of slack. By setting this value, the timer subsystem + * will schedule the actual timer somewhere between + * the time mod_timer() asks for, and that time plus the slack. + * + * By setting the slack to -1, a percentage of the delay is used + * instead. 
+ */ +void set_timer_slack(struct timer_list *timer, int slack_hz) +{ + timer->slack = slack_hz; +} +EXPORT_SYMBOL_GPL(set_timer_slack); + +/* + * If the list is empty, catch up ->timer_jiffies to the current time. + * The caller must hold the tvec_base lock. Returns true if the list + * was empty and therefore ->timer_jiffies was updated. + */ +static bool catchup_timer_jiffies(struct tvec_base *base) +{ + if (!base->all_timers) { + base->timer_jiffies = jiffies; + return true; + } + return false; +} + +static void +__internal_add_timer(struct tvec_base *base, struct timer_list *timer) +{ + unsigned long expires = timer->expires; + unsigned long idx = expires - base->timer_jiffies; + struct list_head *vec; + + if (idx < TVR_SIZE) { + int i = expires & TVR_MASK; + vec = base->tv1.vec + i; + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { + int i = (expires >> TVR_BITS) & TVN_MASK; + vec = base->tv2.vec + i; + } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; + vec = base->tv3.vec + i; + } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; + vec = base->tv4.vec + i; + } else if ((signed long) idx < 0) { + /* + * Can happen if you add a timer with expires == jiffies, + * or you set a timer to go off in the past + */ + vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); + } else { + int i; + /* If the timeout is larger than MAX_TVAL (on 64-bit + * architectures or with CONFIG_BASE_SMALL=1) then we + * use the maximum timeout. 
+ */ + if (idx > MAX_TVAL) { + idx = MAX_TVAL; + expires = idx + base->timer_jiffies; + } + i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; + vec = base->tv5.vec + i; + } + /* + * Timers are FIFO: + */ + list_add_tail(&timer->entry, vec); +} + +static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +{ + (void)catchup_timer_jiffies(base); + __internal_add_timer(base, timer); + /* + * Update base->active_timers and base->next_timer + */ + if (!tbase_get_deferrable(timer->base)) { + if (!base->active_timers++ || + time_before(timer->expires, base->next_timer)) + base->next_timer = timer->expires; + } + base->all_timers++; +} + +#ifdef CONFIG_TIMER_STATS +void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} + +static void timer_stats_account_timer(struct timer_list *timer) +{ + unsigned int flag = 0; + + if (likely(!timer->start_site)) + return; + if (unlikely(tbase_get_deferrable(timer->base))) + flag |= TIMER_STATS_FLAG_DEFERRABLE; + + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, flag); +} + +#else +static void timer_stats_account_timer(struct timer_list *timer) {} +#endif + +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static struct debug_obj_descr timer_debug_descr; + +static void *timer_debug_hint(void *addr) +{ + return ((struct timer_list *) addr)->function; +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int timer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_init(timer, &timer_debug_descr); + return 1; + default: + return 0; + } +} + +/* Stub timer callback for improperly used timers. 
*/ +static void stub_timer(unsigned long data) +{ + WARN_ON(1); +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int timer_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. The timer was + * statically initialized. We just make sure that it + * is tracked in the object tracker. + */ + if (timer->entry.next == NULL && + timer->entry.prev == TIMER_ENTRY_STATIC) { + debug_object_init(timer, &timer_debug_descr); + debug_object_activate(timer, &timer_debug_descr); + return 0; + } else { + setup_timer(timer, stub_timer, 0); + return 1; + } + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int timer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_free(timer, &timer_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_assert_init is called when: + * - an untracked/uninit-ed object is found + */ +static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_NOTAVAILABLE: + if (timer->entry.prev == TIMER_ENTRY_STATIC) { + /* + * This is not really a fixup. The timer was + * statically initialized. We just make sure that it + * is tracked in the object tracker. 
+ */ + debug_object_init(timer, &timer_debug_descr); + return 0; + } else { + setup_timer(timer, stub_timer, 0); + return 1; + } + default: + return 0; + } +} + +static struct debug_obj_descr timer_debug_descr = { + .name = "timer_list", + .debug_hint = timer_debug_hint, + .fixup_init = timer_fixup_init, + .fixup_activate = timer_fixup_activate, + .fixup_free = timer_fixup_free, + .fixup_assert_init = timer_fixup_assert_init, +}; + +static inline void debug_timer_init(struct timer_list *timer) +{ + debug_object_init(timer, &timer_debug_descr); +} + +static inline void debug_timer_activate(struct timer_list *timer) +{ + debug_object_activate(timer, &timer_debug_descr); +} + +static inline void debug_timer_deactivate(struct timer_list *timer) +{ + debug_object_deactivate(timer, &timer_debug_descr); +} + +static inline void debug_timer_free(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} + +static inline void debug_timer_assert_init(struct timer_list *timer) +{ + debug_object_assert_init(timer, &timer_debug_descr); +} + +static void do_init_timer(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key); + +void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) +{ + debug_object_init_on_stack(timer, &timer_debug_descr); + do_init_timer(timer, flags, name, key); +} +EXPORT_SYMBOL_GPL(init_timer_on_stack_key); + +void destroy_timer_on_stack(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_timer_on_stack); + +#else +static inline void debug_timer_init(struct timer_list *timer) { } +static inline void debug_timer_activate(struct timer_list *timer) { } +static inline void debug_timer_deactivate(struct timer_list *timer) { } +static inline void debug_timer_assert_init(struct timer_list *timer) { } +#endif + +static inline void debug_init(struct timer_list *timer) +{ + 
debug_timer_init(timer); + trace_timer_init(timer); +} + +static inline void +debug_activate(struct timer_list *timer, unsigned long expires) +{ + debug_timer_activate(timer); + trace_timer_start(timer, expires); +} + +static inline void debug_deactivate(struct timer_list *timer) +{ + debug_timer_deactivate(timer); + trace_timer_cancel(timer); +} + +static inline void debug_assert_init(struct timer_list *timer) +{ + debug_timer_assert_init(timer); +} + +static void do_init_timer(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) +{ + struct tvec_base *base = __raw_get_cpu_var(tvec_bases); + + timer->entry.next = NULL; + timer->base = (void *)((unsigned long)base | flags); + timer->slack = -1; +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif + lockdep_init_map(&timer->lockdep_map, name, key, 0); +} + +/** + * init_timer_key - initialize a timer + * @timer: the timer to be initialized + * @flags: timer flags + * @name: name of the timer + * @key: lockdep class key of the fake lock used for tracking timer + * sync lock dependencies + * + * init_timer_key() must be done to a timer prior calling *any* of the + * other timer functions. 
+ */ +void init_timer_key(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) +{ + debug_init(timer); + do_init_timer(timer, flags, name, key); +} +EXPORT_SYMBOL(init_timer_key); + +static inline void detach_timer(struct timer_list *timer, bool clear_pending) +{ + struct list_head *entry = &timer->entry; + + debug_deactivate(timer); + + __list_del(entry->prev, entry->next); + if (clear_pending) + entry->next = NULL; + entry->prev = LIST_POISON2; +} + +static inline void +detach_expired_timer(struct timer_list *timer, struct tvec_base *base) +{ + detach_timer(timer, true); + if (!tbase_get_deferrable(timer->base)) + base->active_timers--; + base->all_timers--; + (void)catchup_timer_jiffies(base); +} + +static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, + bool clear_pending) +{ + if (!timer_pending(timer)) + return 0; + + detach_timer(timer, clear_pending); + if (!tbase_get_deferrable(timer->base)) { + base->active_timers--; + if (timer->expires == base->next_timer) + base->next_timer = base->timer_jiffies; + } + base->all_timers--; + (void)catchup_timer_jiffies(base); + return 1; +} + +/* + * We are using hashed locking: holding per_cpu(tvec_bases).lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on ->tvX lists. + * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = NULL and drop the lock: the timer remains + * locked. 
+ */ +static struct tvec_base *lock_timer_base(struct timer_list *timer, + unsigned long *flags) + __acquires(timer->base->lock) +{ + struct tvec_base *base; + + for (;;) { + struct tvec_base *prelock_base = timer->base; + base = tbase_get_base(prelock_base); + if (likely(base != NULL)) { + spin_lock_irqsave(&base->lock, *flags); + if (likely(prelock_base == timer->base)) + return base; + /* The timer has migrated to another CPU */ + spin_unlock_irqrestore(&base->lock, *flags); + } + cpu_relax(); + } +} + +static inline int +__mod_timer(struct timer_list *timer, unsigned long expires, + bool pending_only, int pinned) +{ + struct tvec_base *base, *new_base; + unsigned long flags; + int ret = 0 , cpu; + + timer_stats_timer_set_start_info(timer); + BUG_ON(!timer->function); + + base = lock_timer_base(timer, &flags); + + ret = detach_if_pending(timer, base, false); + if (!ret && pending_only) + goto out_unlock; + + debug_activate(timer, expires); + + cpu = get_nohz_timer_target(pinned); + new_base = per_cpu(tvec_bases, cpu); + + if (base != new_base) { + /* + * We are trying to schedule the timer on the local CPU. + * However we can't change timer's base while it is running, + * otherwise del_timer_sync() can't detect that the timer's + * handler yet has not finished. This also guarantees that + * the timer is serialized wrt itself. 
+ */ + if (likely(base->running_timer != timer)) { + /* See the comment in lock_timer_base() */ + timer_set_base(timer, NULL); + spin_unlock(&base->lock); + base = new_base; + spin_lock(&base->lock); + timer_set_base(timer, base); + } + } + + timer->expires = expires; + internal_add_timer(base, timer); + +out_unlock: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + +/** + * mod_timer_pending - modify a pending timer's timeout + * @timer: the pending timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer_pending() is the same for pending timers as mod_timer(), + * but will not re-activate and modify already deleted timers. + * + * It is useful for unserialized use of timers. + */ +int mod_timer_pending(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); +} +EXPORT_SYMBOL(mod_timer_pending); + +/* + * Decide where to put the timer while taking the slack into account + * + * Algorithm: + * 1) calculate the maximum (absolute) time + * 2) calculate the highest bit where the expires and new max are different + * 3) use this bit to make a mask + * 4) use the bitmask to round down the maximum time, so that all last + * bits are zeros + */ +static inline +unsigned long apply_slack(struct timer_list *timer, unsigned long expires) +{ + unsigned long expires_limit, mask; + int bit; + + if (timer->slack >= 0) { + expires_limit = expires + timer->slack; + } else { + long delta = expires - jiffies; + + if (delta < 256) + return expires; + + expires_limit = expires + delta / 256; + } + mask = expires ^ expires_limit; + if (mask == 0) + return expires; + + bit = find_last_bit(&mask, BITS_PER_LONG); + + mask = (1UL << bit) - 1; + + expires_limit = expires_limit & ~(mask); + + return expires_limit; +} + +/** + * mod_timer - modify a timer's timeout + * @timer: the timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer() is a more efficient way to update the expire 
field of an + * active timer (if the timer is inactive it will be activated) + * + * mod_timer(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + * + * Note that if there are multiple unserialized concurrent users of the + * same timer, then mod_timer() is the only safe way to modify the timeout, + * since add_timer() cannot modify an already running timer. + * + * The function returns whether it has modified a pending timer or not. + * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an + * active timer returns 1.) + */ +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + expires = apply_slack(timer, expires); + + /* + * This is a common optimization triggered by the + * networking code - if the timer is re-modified + * to be the same thing then just return: + */ + if (timer_pending(timer) && timer->expires == expires) + return 1; + + return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); +} +EXPORT_SYMBOL(mod_timer); + +/** + * mod_timer_pinned - modify a timer's timeout + * @timer: the timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer_pinned() is a way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * and to ensure that the timer is scheduled on the current CPU. + * + * Note that this does not prevent the timer from being migrated + * when the current CPU goes offline. If this is a problem for + * you, use CPU-hotplug notifiers to handle it correctly, for + * example, cancelling the timer when the corresponding CPU goes + * offline. 
+ * + * mod_timer_pinned(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + */ +int mod_timer_pinned(struct timer_list *timer, unsigned long expires) +{ + if (timer->expires == expires && timer_pending(timer)) + return 1; + + return __mod_timer(timer, expires, false, TIMER_PINNED); +} +EXPORT_SYMBOL(mod_timer_pinned); + +/** + * add_timer - start a timer + * @timer: the timer to be added + * + * The kernel will do a ->function(->data) callback from the + * timer interrupt at the ->expires point in the future. The + * current time is 'jiffies'. + * + * The timer's ->expires, ->function (and if the handler uses it, ->data) + * fields must be set prior calling this function. + * + * Timers with an ->expires field in the past will be executed in the next + * timer tick. + */ +void add_timer(struct timer_list *timer) +{ + BUG_ON(timer_pending(timer)); + mod_timer(timer, timer->expires); +} +EXPORT_SYMBOL(add_timer); + +/** + * add_timer_on - start a timer on a particular CPU + * @timer: the timer to be added + * @cpu: the CPU to start it on + * + * This is not very scalable on SMP. Double adds are not possible. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ + struct tvec_base *base = per_cpu(tvec_bases, cpu); + unsigned long flags; + + timer_stats_timer_set_start_info(timer); + BUG_ON(timer_pending(timer) || !timer->function); + spin_lock_irqsave(&base->lock, flags); + timer_set_base(timer, base); + debug_activate(timer, timer->expires); + internal_add_timer(base, timer); + /* + * Check whether the other CPU is in dynticks mode and needs + * to be triggered to reevaluate the timer wheel. + * We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to stop its tick can not + * evaluate the timer wheel. + * + * Spare the IPI for deferrable timers on idle targets though. + * The next busy ticks will take care of it. 
Except full dynticks + * require special care against races with idle_cpu(), lets deal + * with that later. + */ + if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu)) + wake_up_nohz_cpu(cpu); + + spin_unlock_irqrestore(&base->lock, flags); +} +EXPORT_SYMBOL_GPL(add_timer_on); + +/** + * del_timer - deactive a timer. + * @timer: the timer to be deactivated + * + * del_timer() deactivates a timer - this works on both active and inactive + * timers. + * + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + */ +int del_timer(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = 0; + + debug_assert_init(timer); + + timer_stats_timer_clear_start_info(timer); + if (timer_pending(timer)) { + base = lock_timer_base(timer, &flags); + ret = detach_if_pending(timer, base, true); + spin_unlock_irqrestore(&base->lock, flags); + } + + return ret; +} +EXPORT_SYMBOL(del_timer); + +/** + * try_to_del_timer_sync - Try to deactivate a timer + * @timer: timer do del + * + * This function tries to deactivate a timer. Upon successful (ret >= 0) + * exit the timer is not queued and the handler is not running on any CPU. + */ +int try_to_del_timer_sync(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = -1; + + debug_assert_init(timer); + + base = lock_timer_base(timer, &flags); + + if (base->running_timer != timer) { + timer_stats_timer_clear_start_info(timer); + ret = detach_if_pending(timer, base, true); + } + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} +EXPORT_SYMBOL(try_to_del_timer_sync); + +#ifdef CONFIG_SMP +/** + * del_timer_sync - deactivate a timer and wait for the handler to finish. 
+ * @timer: the timer to be deactivated + * + * This function only differs from del_timer() on SMP: besides deactivating + * the timer it also makes sure the handler has finished executing on other + * CPUs. + * + * Synchronization rules: Callers must prevent restarting of the timer, + * otherwise this function is meaningless. It must not be called from + * interrupt contexts unless the timer is an irqsafe one. The caller must + * not hold locks which would prevent completion of the timer's + * handler. The timer's handler must not call add_timer_on(). Upon exit the + * timer is not queued and the handler is not running on any CPU. + * + * Note: For !irqsafe timers, you must not hold locks that are held in + * interrupt context while calling this function. Even if the lock has + * nothing to do with the timer in question. Here's why: + * + * CPU0 CPU1 + * ---- ---- + * + * call_timer_fn(); + * base->running_timer = mytimer; + * spin_lock_irq(somelock); + * + * spin_lock(somelock); + * del_timer_sync(mytimer); + * while (base->running_timer == mytimer); + * + * Now del_timer_sync() will never return and never release somelock. + * The interrupt on the other CPU is waiting to grab somelock but + * it has interrupted the softirq that CPU0 is waiting to finish. + * + * The function returns whether it has deactivated a pending timer or not. + */ +int del_timer_sync(struct timer_list *timer) +{ +#ifdef CONFIG_LOCKDEP + unsigned long flags; + + /* + * If lockdep gives a backtrace here, please reference + * the synchronization rules above. + */ + local_irq_save(flags); + lock_map_acquire(&timer->lockdep_map); + lock_map_release(&timer->lockdep_map); + local_irq_restore(flags); +#endif + /* + * don't use it in hardirq context, because it + * could lead to deadlock. 
+ */ + WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); + for (;;) { + int ret = try_to_del_timer_sync(timer); + if (ret >= 0) + return ret; + cpu_relax(); + } +} +EXPORT_SYMBOL(del_timer_sync); +#endif + +static int cascade(struct tvec_base *base, struct tvec *tv, int index) +{ + /* cascade all the timers from tv up one level */ + struct timer_list *timer, *tmp; + struct list_head tv_list; + + list_replace_init(tv->vec + index, &tv_list); + + /* + * We are removing _all_ timers from the list, so we + * don't have to detach them individually. + */ + list_for_each_entry_safe(timer, tmp, &tv_list, entry) { + BUG_ON(tbase_get_base(timer->base) != base); + /* No accounting, while moving them */ + __internal_add_timer(base, timer); + } + + return index; +} + +static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), + unsigned long data) +{ + int count = preempt_count(); + +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the timer from inside the + * function that is called from it, this we need to take into + * account for lockdep too. To avoid bogus "held lock freed" + * warnings as well as problems when looking into + * timer->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map; + + lockdep_copy_map(&lockdep_map, &timer->lockdep_map); +#endif + /* + * Couple the lock chain with the lock chain at + * del_timer_sync() by acquiring the lock_map around the fn() + * call here and in del_timer_sync(). + */ + lock_map_acquire(&lockdep_map); + + trace_timer_expire_entry(timer); + fn(data); + trace_timer_expire_exit(timer); + + lock_map_release(&lockdep_map); + + if (count != preempt_count()) { + WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + fn, count, preempt_count()); + /* + * Restore the preempt count. That gives us a decent + * chance to survive and extract information. If the + * callback kept a lock held, bad luck, but not worse + * than the BUG() we had. 
+ */ + preempt_count_set(count); + } +} + +#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) + +/** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. + * + * This function cascades all vectors and executes all expired timer + * vectors. + */ +static inline void __run_timers(struct tvec_base *base) +{ + struct timer_list *timer; + + spin_lock_irq(&base->lock); + if (catchup_timer_jiffies(base)) { + spin_unlock_irq(&base->lock); + return; + } + while (time_after_eq(jiffies, base->timer_jiffies)) { + struct list_head work_list; + struct list_head *head = &work_list; + int index = base->timer_jiffies & TVR_MASK; + + /* + * Cascade timers: + */ + if (!index && + (!cascade(base, &base->tv2, INDEX(0))) && + (!cascade(base, &base->tv3, INDEX(1))) && + !cascade(base, &base->tv4, INDEX(2))) + cascade(base, &base->tv5, INDEX(3)); + ++base->timer_jiffies; + list_replace_init(base->tv1.vec + index, head); + while (!list_empty(head)) { + void (*fn)(unsigned long); + unsigned long data; + bool irqsafe; + + timer = list_first_entry(head, struct timer_list,entry); + fn = timer->function; + data = timer->data; + irqsafe = tbase_get_irqsafe(timer->base); + + timer_stats_account_timer(timer); + + base->running_timer = timer; + detach_expired_timer(timer, base); + + if (irqsafe) { + spin_unlock(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock(&base->lock); + } else { + spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock_irq(&base->lock); + } + } + } + base->running_timer = NULL; + spin_unlock_irq(&base->lock); +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * Find out when the next timer event is due to happen. This + * is used on S/390 to stop all activity when a CPU is idle. + * This function needs to be called with interrupts disabled. 
+ */ +static unsigned long __next_timer_interrupt(struct tvec_base *base) +{ + unsigned long timer_jiffies = base->timer_jiffies; + unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; + int index, slot, array, found = 0; + struct timer_list *nte; + struct tvec *varray[4]; + + /* Look for timer events in tv1. */ + index = slot = timer_jiffies & TVR_MASK; + do { + list_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + + found = 1; + expires = nte->expires; + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + goto cascade; + return expires; + } + slot = (slot + 1) & TVR_MASK; + } while (slot != index); + +cascade: + /* Calculate the next cascade event */ + if (index) + timer_jiffies += TVR_SIZE - index; + timer_jiffies >>= TVR_BITS; + + /* Check tv2-tv5. */ + varray[0] = &base->tv2; + varray[1] = &base->tv3; + varray[2] = &base->tv4; + varray[3] = &base->tv5; + + for (array = 0; array < 4; array++) { + struct tvec *varp = varray[array]; + + index = slot = timer_jiffies & TVN_MASK; + do { + list_for_each_entry(nte, varp->vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + + found = 1; + if (time_before(nte->expires, expires)) + expires = nte->expires; + } + /* + * Do we still search for the first timer or are + * we looking up the cascade buckets ? + */ + if (found) { + /* Look at the cascade bucket(s)? 
*/ + if (!index || slot < index) + break; + return expires; + } + slot = (slot + 1) & TVN_MASK; + } while (slot != index); + + if (index) + timer_jiffies += TVN_SIZE - index; + timer_jiffies >>= TVN_BITS; + } + return expires; +} + +/* + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +static unsigned long cmp_next_hrtimer_event(unsigned long now, + unsigned long expires) +{ + ktime_t hr_delta = hrtimer_get_next_event(); + struct timespec tsdelta; + unsigned long delta; + + if (hr_delta.tv64 == KTIME_MAX) + return expires; + + /* + * Expired timer available, let it expire in the next tick + */ + if (hr_delta.tv64 <= 0) + return now + 1; + + tsdelta = ktime_to_timespec(hr_delta); + delta = timespec_to_jiffies(&tsdelta); + + /* + * Limit the delta to the max value, which is checked in + * tick_nohz_stop_sched_tick(): + */ + if (delta > NEXT_TIMER_MAX_DELTA) + delta = NEXT_TIMER_MAX_DELTA; + + /* + * Take rounding errors in to account and make sure, that it + * expires in the next tick. Otherwise we go into an endless + * ping pong due to tick_nohz_stop_sched_tick() retriggering + * the timer softirq + */ + if (delta < 1) + delta = 1; + now += delta; + if (time_before(now, expires)) + return now; + return expires; +} + +/** + * get_next_timer_interrupt - return the jiffy of the next pending timer + * @now: current time (in jiffies) + */ +unsigned long get_next_timer_interrupt(unsigned long now) +{ + struct tvec_base *base = __this_cpu_read(tvec_bases); + unsigned long expires = now + NEXT_TIMER_MAX_DELTA; + + /* + * Pretend that there is no timer pending if the cpu is offline. + * Possible pending timers will be migrated later to an active cpu. 
+ */ + if (cpu_is_offline(smp_processor_id())) + return expires; + + spin_lock(&base->lock); + if (base->active_timers) { + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; + } + spin_unlock(&base->lock); + + if (time_before_eq(expires, now)) + return now; + + return cmp_next_hrtimer_event(now, expires); +} +#endif + +/* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + int cpu = smp_processor_id(); + + /* Note: this timer irq context must be accounted for as well. */ + account_process_tick(p, user_tick); + run_local_timers(); + rcu_check_callbacks(cpu, user_tick); +#ifdef CONFIG_IRQ_WORK + if (in_irq()) + irq_work_run(); +#endif + scheduler_tick(); + run_posix_cpu_timers(p); +} + +/* + * This function runs timers and the timer-tq in bottom half context. + */ +static void run_timer_softirq(struct softirq_action *h) +{ + struct tvec_base *base = __this_cpu_read(tvec_bases); + + hrtimer_run_pending(); + + if (time_after_eq(jiffies, base->timer_jiffies)) + __run_timers(base); +} + +/* + * Called by the local, per-CPU timer interrupt on SMP. + */ +void run_local_timers(void) +{ + hrtimer_run_queues(); + raise_softirq(TIMER_SOFTIRQ); +} + +#ifdef __ARCH_WANT_SYS_ALARM + +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +SYSCALL_DEFINE1(alarm, unsigned int, seconds) +{ + return alarm_setitimer(seconds); +} + +#endif + +static void process_timeout(unsigned long __data) +{ + wake_up_process((struct task_struct *)__data); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have + * elapsed. 
The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns. The routine will return 0 + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. In this case the remaining time + * in jiffies will be returned, or 0 if the timer expired in time + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * In all cases the return value is guaranteed to be non-negative. + */ +signed long __sched schedule_timeout(signed long timeout) +{ + struct timer_list timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. 
+ */ + if (timeout < 0) { + printk(KERN_ERR "schedule_timeout: wrong timeout " + "value %lx\n", timeout); + dump_stack(); + current->state = TASK_RUNNING; + goto out; + } + } + + expire = timeout + jiffies; + + setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); + __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); + schedule(); + del_singleshot_timer_sync(&timer); + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 0 : timeout; +} +EXPORT_SYMBOL(schedule_timeout); + +/* + * We can use __set_current_state() here because schedule_timeout() calls + * schedule() unconditionally. + */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +signed long __sched schedule_timeout_killable(signed long timeout) +{ + __set_current_state(TASK_KILLABLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_killable); + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + +static int init_timers_cpu(int cpu) +{ + int j; + struct tvec_base *base; + static char tvec_base_done[NR_CPUS]; + + if (!tvec_base_done[cpu]) { + static char boot_done; + + if (boot_done) { + /* + * The APs use this path later in boot + */ + base = kzalloc_node(sizeof(*base), GFP_KERNEL, + cpu_to_node(cpu)); + if (!base) + return -ENOMEM; + + /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ + if (WARN_ON(base != tbase_get_base(base))) { + kfree(base); + return -ENOMEM; + } + per_cpu(tvec_bases, cpu) = base; + } else { + /* + * This is for the boot CPU - we use compile-time + * static initialisation because per-cpu memory isn't + * ready yet and because the 
memory allocators are not + * initialised either. + */ + boot_done = 1; + base = &boot_tvec_bases; + } + spin_lock_init(&base->lock); + tvec_base_done[cpu] = 1; + } else { + base = per_cpu(tvec_bases, cpu); + } + + + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD(base->tv5.vec + j); + INIT_LIST_HEAD(base->tv4.vec + j); + INIT_LIST_HEAD(base->tv3.vec + j); + INIT_LIST_HEAD(base->tv2.vec + j); + } + for (j = 0; j < TVR_SIZE; j++) + INIT_LIST_HEAD(base->tv1.vec + j); + + base->timer_jiffies = jiffies; + base->next_timer = base->timer_jiffies; + base->active_timers = 0; + base->all_timers = 0; + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) +{ + struct timer_list *timer; + + while (!list_empty(head)) { + timer = list_first_entry(head, struct timer_list, entry); + /* We ignore the accounting on the dying cpu */ + detach_timer(timer, false); + timer_set_base(timer, new_base); + internal_add_timer(new_base, timer); + } +} + +static void migrate_timers(int cpu) +{ + struct tvec_base *old_base; + struct tvec_base *new_base; + int i; + + BUG_ON(cpu_online(cpu)); + old_base = per_cpu(tvec_bases, cpu); + new_base = get_cpu_var(tvec_bases); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. 
+ */ + spin_lock_irq(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + BUG_ON(old_base->running_timer); + + for (i = 0; i < TVR_SIZE; i++) + migrate_timer_list(new_base, old_base->tv1.vec + i); + for (i = 0; i < TVN_SIZE; i++) { + migrate_timer_list(new_base, old_base->tv2.vec + i); + migrate_timer_list(new_base, old_base->tv3.vec + i); + migrate_timer_list(new_base, old_base->tv4.vec + i); + migrate_timer_list(new_base, old_base->tv5.vec + i); + } + + spin_unlock(&old_base->lock); + spin_unlock_irq(&new_base->lock); + put_cpu_var(tvec_bases); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int timer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + int err; + + switch(action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + err = init_timers_cpu(cpu); + if (err < 0) + return notifier_from_errno(err); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + case CPU_DEAD_FROZEN: + migrate_timers(cpu); + break; +#endif + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block timers_nb = { + .notifier_call = timer_cpu_notify, +}; + + +void __init init_timers(void) +{ + int err; + + /* ensure there are enough low bits for flags in timer->base pointer */ + BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); + + err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + BUG_ON(err != NOTIFY_OK); + + init_timer_stats(); + register_cpu_notifier(&timers_nb); + open_softirq(TIMER_SOFTIRQ, run_timer_softirq); +} + +/** + * msleep - sleep safely even with waitqueue interruptions + * @msecs: Time in milliseconds to sleep for + */ +void msleep(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +} + +EXPORT_SYMBOL(msleep); + +/** + * msleep_interruptible - sleep waiting for signals + * @msecs: Time in 
milliseconds to sleep for + */ +unsigned long msleep_interruptible(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); + return jiffies_to_msecs(timeout); +} + +EXPORT_SYMBOL(msleep_interruptible); + +static int __sched do_usleep_range(unsigned long min, unsigned long max) +{ + ktime_t kmin; + unsigned long delta; + + kmin = ktime_set(0, min * NSEC_PER_USEC); + delta = (max - min) * NSEC_PER_USEC; + return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); +} + +/** + * usleep_range - Drop in replacement for udelay where wakeup is flexible + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + */ +void usleep_range(unsigned long min, unsigned long max) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + do_usleep_range(min, max); +} +EXPORT_SYMBOL(usleep_range); diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc deleted file mode 100644 index 511bdf2..0000000 --- a/kernel/timeconst.bc +++ /dev/null @@ -1,108 +0,0 @@ -scale=0 - -define gcd(a,b) { - auto t; - while (b) { - t = b; - b = a % b; - a = t; - } - return a; -} - -/* Division by reciprocal multiplication. */ -define fmul(b,n,d) { - return (2^b*n+d-1)/d; -} - -/* Adjustment factor when a ceiling value is used. Use as: - (imul * n) + (fmulxx * n + fadjxx) >> xx) */ -define fadj(b,n,d) { - auto v; - d = d/gcd(n,d); - v = 2^b*(d-1)/d; - return v; -} - -/* Compute the appropriate mul/adj values as well as a shift count, - which brings the mul value into the range 2^b-1 <= x < 2^b. Such - a shift value will be correct in the signed integer range and off - by at most one in the upper half of the unsigned range. 
*/ -define fmuls(b,n,d) { - auto s, m; - for (s = 0; 1; s++) { - m = fmul(s,n,d); - if (m >= 2^(b-1)) - return s; - } - return 0; -} - -define timeconst(hz) { - print "/* Automatically generated by kernel/timeconst.bc */\n" - print "/* Time conversion constants for HZ == ", hz, " */\n" - print "\n" - - print "#ifndef KERNEL_TIMECONST_H\n" - print "#define KERNEL_TIMECONST_H\n\n" - - print "#include \n" - print "#include \n\n" - - print "#if HZ != ", hz, "\n" - print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n" - print "#endif\n\n" - - if (hz < 2) { - print "#error Totally bogus HZ value!\n" - } else { - s=fmuls(32,1000,hz) - obase=16 - print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n" - print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n" - obase=10 - print "#define HZ_TO_MSEC_SHR32\t", s, "\n" - - s=fmuls(32,hz,1000) - obase=16 - print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n" - print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n" - obase=10 - print "#define MSEC_TO_HZ_SHR32\t", s, "\n" - - obase=10 - cd=gcd(hz,1000) - print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n" - print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n" - print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n" - print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n" - print "\n" - - s=fmuls(32,1000000,hz) - obase=16 - print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n" - print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n" - obase=10 - print "#define HZ_TO_USEC_SHR32\t", s, "\n" - - s=fmuls(32,hz,1000000) - obase=16 - print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n" - print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n" - obase=10 - print "#define USEC_TO_HZ_SHR32\t", s, "\n" - - obase=10 - cd=gcd(hz,1000000) - print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n" - print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" - print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" - 
print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" - print "\n" - - print "#endif /* KERNEL_TIMECONST_H */\n" - } - halt -} - -timeconst(hz) diff --git a/kernel/timer.c b/kernel/timer.c deleted file mode 100644 index 3bb01a3..0000000 --- a/kernel/timer.c +++ /dev/null @@ -1,1734 +0,0 @@ -/* - * linux/kernel/timer.c - * - * Kernel internal timers - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. - * - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). - * Copyright (C) 1998 Andrea Arcangeli - * 1999-03-10 Improved NTP compatibility by Ulrich Windl - * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love - * 2000-10-05 Implemented scalable SMP per-CPU timer handling. - * Copyright (C) 2000, 2001, 2002 Ingo Molnar - * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include - -__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - -/* - * per-CPU timer vector definitions: - */ -#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) -#define TVR_BITS (CONFIG_BASE_SMALL ? 
6 : 8) -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) -#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) - -struct tvec { - struct list_head vec[TVN_SIZE]; -}; - -struct tvec_root { - struct list_head vec[TVR_SIZE]; -}; - -struct tvec_base { - spinlock_t lock; - struct timer_list *running_timer; - unsigned long timer_jiffies; - unsigned long next_timer; - unsigned long active_timers; - unsigned long all_timers; - struct tvec_root tv1; - struct tvec tv2; - struct tvec tv3; - struct tvec tv4; - struct tvec tv5; -} ____cacheline_aligned; - -struct tvec_base boot_tvec_bases; -EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; - -/* Functions below help us manage 'deferrable' flag */ -static inline unsigned int tbase_get_deferrable(struct tvec_base *base) -{ - return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); -} - -static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) -{ - return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); -} - -static inline struct tvec_base *tbase_get_base(struct tvec_base *base) -{ - return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); -} - -static inline void -timer_set_base(struct timer_list *timer, struct tvec_base *new_base) -{ - unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; - - timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags); -} - -static unsigned long round_jiffies_common(unsigned long j, int cpu, - bool force_up) -{ - int rem; - unsigned long original = j; - - /* - * We don't want all cpus firing their timers at once hitting the - * same lock or cachelines, so we skew each extra cpu with an extra - * 3 jiffies. This 3 jiffies came originally from the mm/ code which - * already did this. - * The skew is done by adding 3*cpunr, then round, then subtract this - * extra offset again. 
- */ - j += cpu * 3; - - rem = j % HZ; - - /* - * If the target jiffie is just after a whole second (which can happen - * due to delays of the timer irq, long irq off times etc etc) then - * we should round down to the whole second, not up. Use 1/4th second - * as cutoff for this rounding as an extreme upper bound for this. - * But never round down if @force_up is set. - */ - if (rem < HZ/4 && !force_up) /* round down */ - j = j - rem; - else /* round up */ - j = j - rem + HZ; - - /* now that we have rounded, subtract the extra skew again */ - j -= cpu * 3; - - /* - * Make sure j is still in the future. Otherwise return the - * unmodified value. - */ - return time_is_after_jiffies(j) ? j : original; -} - -/** - * __round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. 
- */ -unsigned long __round_jiffies(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, false); -} -EXPORT_SYMBOL_GPL(__round_jiffies); - -/** - * __round_jiffies_relative - function to round jiffies to a full second - * @j: the time in (relative) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies_relative() rounds a time delta in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long __round_jiffies_relative(unsigned long j, int cpu) -{ - unsigned long j0 = jiffies; - - /* Use j0 because jiffies might change while we run */ - return round_jiffies_common(j + j0, cpu, false) - j0; -} -EXPORT_SYMBOL_GPL(__round_jiffies_relative); - -/** - * round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * - * round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. 
- * - * The return value is the rounded version of the @j parameter. - */ -unsigned long round_jiffies(unsigned long j) -{ - return round_jiffies_common(j, raw_smp_processor_id(), false); -} -EXPORT_SYMBOL_GPL(round_jiffies); - -/** - * round_jiffies_relative - function to round jiffies to a full second - * @j: the time in (relative) jiffies that should be rounded - * - * round_jiffies_relative() rounds a time delta in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long round_jiffies_relative(unsigned long j) -{ - return __round_jiffies_relative(j, raw_smp_processor_id()); -} -EXPORT_SYMBOL_GPL(round_jiffies_relative); - -/** - * __round_jiffies_up - function to round jiffies up to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * This is the same as __round_jiffies() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long __round_jiffies_up(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, true); -} -EXPORT_SYMBOL_GPL(__round_jiffies_up); - -/** - * __round_jiffies_up_relative - function to round jiffies up to a full second - * @j: the time in (relative) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * This is the same as __round_jiffies_relative() except that it will never - * round down. 
This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) -{ - unsigned long j0 = jiffies; - - /* Use j0 because jiffies might change while we run */ - return round_jiffies_common(j + j0, cpu, true) - j0; -} -EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); - -/** - * round_jiffies_up - function to round jiffies up to a full second - * @j: the time in (absolute) jiffies that should be rounded - * - * This is the same as round_jiffies() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long round_jiffies_up(unsigned long j) -{ - return round_jiffies_common(j, raw_smp_processor_id(), true); -} -EXPORT_SYMBOL_GPL(round_jiffies_up); - -/** - * round_jiffies_up_relative - function to round jiffies up to a full second - * @j: the time in (relative) jiffies that should be rounded - * - * This is the same as round_jiffies_relative() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long round_jiffies_up_relative(unsigned long j) -{ - return __round_jiffies_up_relative(j, raw_smp_processor_id()); -} -EXPORT_SYMBOL_GPL(round_jiffies_up_relative); - -/** - * set_timer_slack - set the allowed slack for a timer - * @timer: the timer to be modified - * @slack_hz: the amount of time (in jiffies) allowed for rounding - * - * Set the amount of time, in jiffies, that a certain timer has - * in terms of slack. By setting this value, the timer subsystem - * will schedule the actual timer somewhere between - * the time mod_timer() asks for, and that time plus the slack. - * - * By setting the slack to -1, a percentage of the delay is used - * instead. 
- */ -void set_timer_slack(struct timer_list *timer, int slack_hz) -{ - timer->slack = slack_hz; -} -EXPORT_SYMBOL_GPL(set_timer_slack); - -/* - * If the list is empty, catch up ->timer_jiffies to the current time. - * The caller must hold the tvec_base lock. Returns true if the list - * was empty and therefore ->timer_jiffies was updated. - */ -static bool catchup_timer_jiffies(struct tvec_base *base) -{ - if (!base->all_timers) { - base->timer_jiffies = jiffies; - return true; - } - return false; -} - -static void -__internal_add_timer(struct tvec_base *base, struct timer_list *timer) -{ - unsigned long expires = timer->expires; - unsigned long idx = expires - base->timer_jiffies; - struct list_head *vec; - - if (idx < TVR_SIZE) { - int i = expires & TVR_MASK; - vec = base->tv1.vec + i; - } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { - int i = (expires >> TVR_BITS) & TVN_MASK; - vec = base->tv2.vec + i; - } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - vec = base->tv3.vec + i; - } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - vec = base->tv4.vec + i; - } else if ((signed long) idx < 0) { - /* - * Can happen if you add a timer with expires == jiffies, - * or you set a timer to go off in the past - */ - vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); - } else { - int i; - /* If the timeout is larger than MAX_TVAL (on 64-bit - * architectures or with CONFIG_BASE_SMALL=1) then we - * use the maximum timeout. 
- */ - if (idx > MAX_TVAL) { - idx = MAX_TVAL; - expires = idx + base->timer_jiffies; - } - i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - vec = base->tv5.vec + i; - } - /* - * Timers are FIFO: - */ - list_add_tail(&timer->entry, vec); -} - -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) -{ - (void)catchup_timer_jiffies(base); - __internal_add_timer(base, timer); - /* - * Update base->active_timers and base->next_timer - */ - if (!tbase_get_deferrable(timer->base)) { - if (!base->active_timers++ || - time_before(timer->expires, base->next_timer)) - base->next_timer = timer->expires; - } - base->all_timers++; -} - -#ifdef CONFIG_TIMER_STATS -void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) -{ - if (timer->start_site) - return; - - timer->start_site = addr; - memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); - timer->start_pid = current->pid; -} - -static void timer_stats_account_timer(struct timer_list *timer) -{ - unsigned int flag = 0; - - if (likely(!timer->start_site)) - return; - if (unlikely(tbase_get_deferrable(timer->base))) - flag |= TIMER_STATS_FLAG_DEFERRABLE; - - timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - timer->function, timer->start_comm, flag); -} - -#else -static void timer_stats_account_timer(struct timer_list *timer) {} -#endif - -#ifdef CONFIG_DEBUG_OBJECTS_TIMERS - -static struct debug_obj_descr timer_debug_descr; - -static void *timer_debug_hint(void *addr) -{ - return ((struct timer_list *) addr)->function; -} - -/* - * fixup_init is called when: - * - an active object is initialized - */ -static int timer_fixup_init(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); - debug_object_init(timer, &timer_debug_descr); - return 1; - default: - return 0; - } -} - -/* Stub timer callback for improperly used timers. 
*/ -static void stub_timer(unsigned long data) -{ - WARN_ON(1); -} - -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - */ -static int timer_fixup_activate(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (timer->entry.next == NULL && - timer->entry.prev == TIMER_ENTRY_STATIC) { - debug_object_init(timer, &timer_debug_descr); - debug_object_activate(timer, &timer_debug_descr); - return 0; - } else { - setup_timer(timer, stub_timer, 0); - return 1; - } - return 0; - - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - - default: - return 0; - } -} - -/* - * fixup_free is called when: - * - an active object is freed - */ -static int timer_fixup_free(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); - debug_object_free(timer, &timer_debug_descr); - return 1; - default: - return 0; - } -} - -/* - * fixup_assert_init is called when: - * - an untracked/uninit-ed object is found - */ -static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.prev == TIMER_ENTRY_STATIC) { - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. 
- */ - debug_object_init(timer, &timer_debug_descr); - return 0; - } else { - setup_timer(timer, stub_timer, 0); - return 1; - } - default: - return 0; - } -} - -static struct debug_obj_descr timer_debug_descr = { - .name = "timer_list", - .debug_hint = timer_debug_hint, - .fixup_init = timer_fixup_init, - .fixup_activate = timer_fixup_activate, - .fixup_free = timer_fixup_free, - .fixup_assert_init = timer_fixup_assert_init, -}; - -static inline void debug_timer_init(struct timer_list *timer) -{ - debug_object_init(timer, &timer_debug_descr); -} - -static inline void debug_timer_activate(struct timer_list *timer) -{ - debug_object_activate(timer, &timer_debug_descr); -} - -static inline void debug_timer_deactivate(struct timer_list *timer) -{ - debug_object_deactivate(timer, &timer_debug_descr); -} - -static inline void debug_timer_free(struct timer_list *timer) -{ - debug_object_free(timer, &timer_debug_descr); -} - -static inline void debug_timer_assert_init(struct timer_list *timer) -{ - debug_object_assert_init(timer, &timer_debug_descr); -} - -static void do_init_timer(struct timer_list *timer, unsigned int flags, - const char *name, struct lock_class_key *key); - -void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags, - const char *name, struct lock_class_key *key) -{ - debug_object_init_on_stack(timer, &timer_debug_descr); - do_init_timer(timer, flags, name, key); -} -EXPORT_SYMBOL_GPL(init_timer_on_stack_key); - -void destroy_timer_on_stack(struct timer_list *timer) -{ - debug_object_free(timer, &timer_debug_descr); -} -EXPORT_SYMBOL_GPL(destroy_timer_on_stack); - -#else -static inline void debug_timer_init(struct timer_list *timer) { } -static inline void debug_timer_activate(struct timer_list *timer) { } -static inline void debug_timer_deactivate(struct timer_list *timer) { } -static inline void debug_timer_assert_init(struct timer_list *timer) { } -#endif - -static inline void debug_init(struct timer_list *timer) -{ - 
debug_timer_init(timer); - trace_timer_init(timer); -} - -static inline void -debug_activate(struct timer_list *timer, unsigned long expires) -{ - debug_timer_activate(timer); - trace_timer_start(timer, expires); -} - -static inline void debug_deactivate(struct timer_list *timer) -{ - debug_timer_deactivate(timer); - trace_timer_cancel(timer); -} - -static inline void debug_assert_init(struct timer_list *timer) -{ - debug_timer_assert_init(timer); -} - -static void do_init_timer(struct timer_list *timer, unsigned int flags, - const char *name, struct lock_class_key *key) -{ - struct tvec_base *base = __raw_get_cpu_var(tvec_bases); - - timer->entry.next = NULL; - timer->base = (void *)((unsigned long)base | flags); - timer->slack = -1; -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; - timer->start_pid = -1; - memset(timer->start_comm, 0, TASK_COMM_LEN); -#endif - lockdep_init_map(&timer->lockdep_map, name, key, 0); -} - -/** - * init_timer_key - initialize a timer - * @timer: the timer to be initialized - * @flags: timer flags - * @name: name of the timer - * @key: lockdep class key of the fake lock used for tracking timer - * sync lock dependencies - * - * init_timer_key() must be done to a timer prior calling *any* of the - * other timer functions. 
- */ -void init_timer_key(struct timer_list *timer, unsigned int flags, - const char *name, struct lock_class_key *key) -{ - debug_init(timer); - do_init_timer(timer, flags, name, key); -} -EXPORT_SYMBOL(init_timer_key); - -static inline void detach_timer(struct timer_list *timer, bool clear_pending) -{ - struct list_head *entry = &timer->entry; - - debug_deactivate(timer); - - __list_del(entry->prev, entry->next); - if (clear_pending) - entry->next = NULL; - entry->prev = LIST_POISON2; -} - -static inline void -detach_expired_timer(struct timer_list *timer, struct tvec_base *base) -{ - detach_timer(timer, true); - if (!tbase_get_deferrable(timer->base)) - base->active_timers--; - base->all_timers--; - (void)catchup_timer_jiffies(base); -} - -static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, - bool clear_pending) -{ - if (!timer_pending(timer)) - return 0; - - detach_timer(timer, clear_pending); - if (!tbase_get_deferrable(timer->base)) { - base->active_timers--; - if (timer->expires == base->next_timer) - base->next_timer = base->timer_jiffies; - } - base->all_timers--; - (void)catchup_timer_jiffies(base); - return 1; -} - -/* - * We are using hashed locking: holding per_cpu(tvec_bases).lock - * means that all timers which are tied to this base via timer->base are - * locked, and the base itself is locked too. - * - * So __run_timers/migrate_timers can safely modify all timers which could - * be found on ->tvX lists. - * - * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. 
- */ -static struct tvec_base *lock_timer_base(struct timer_list *timer, - unsigned long *flags) - __acquires(timer->base->lock) -{ - struct tvec_base *base; - - for (;;) { - struct tvec_base *prelock_base = timer->base; - base = tbase_get_base(prelock_base); - if (likely(base != NULL)) { - spin_lock_irqsave(&base->lock, *flags); - if (likely(prelock_base == timer->base)) - return base; - /* The timer has migrated to another CPU */ - spin_unlock_irqrestore(&base->lock, *flags); - } - cpu_relax(); - } -} - -static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, - bool pending_only, int pinned) -{ - struct tvec_base *base, *new_base; - unsigned long flags; - int ret = 0 , cpu; - - timer_stats_timer_set_start_info(timer); - BUG_ON(!timer->function); - - base = lock_timer_base(timer, &flags); - - ret = detach_if_pending(timer, base, false); - if (!ret && pending_only) - goto out_unlock; - - debug_activate(timer, expires); - - cpu = get_nohz_timer_target(pinned); - new_base = per_cpu(tvec_bases, cpu); - - if (base != new_base) { - /* - * We are trying to schedule the timer on the local CPU. - * However we can't change timer's base while it is running, - * otherwise del_timer_sync() can't detect that the timer's - * handler yet has not finished. This also guarantees that - * the timer is serialized wrt itself. 
- */ - if (likely(base->running_timer != timer)) { - /* See the comment in lock_timer_base() */ - timer_set_base(timer, NULL); - spin_unlock(&base->lock); - base = new_base; - spin_lock(&base->lock); - timer_set_base(timer, base); - } - } - - timer->expires = expires; - internal_add_timer(base, timer); - -out_unlock: - spin_unlock_irqrestore(&base->lock, flags); - - return ret; -} - -/** - * mod_timer_pending - modify a pending timer's timeout - * @timer: the pending timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer_pending() is the same for pending timers as mod_timer(), - * but will not re-activate and modify already deleted timers. - * - * It is useful for unserialized use of timers. - */ -int mod_timer_pending(struct timer_list *timer, unsigned long expires) -{ - return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); -} -EXPORT_SYMBOL(mod_timer_pending); - -/* - * Decide where to put the timer while taking the slack into account - * - * Algorithm: - * 1) calculate the maximum (absolute) time - * 2) calculate the highest bit where the expires and new max are different - * 3) use this bit to make a mask - * 4) use the bitmask to round down the maximum time, so that all last - * bits are zeros - */ -static inline -unsigned long apply_slack(struct timer_list *timer, unsigned long expires) -{ - unsigned long expires_limit, mask; - int bit; - - if (timer->slack >= 0) { - expires_limit = expires + timer->slack; - } else { - long delta = expires - jiffies; - - if (delta < 256) - return expires; - - expires_limit = expires + delta / 256; - } - mask = expires ^ expires_limit; - if (mask == 0) - return expires; - - bit = find_last_bit(&mask, BITS_PER_LONG); - - mask = (1UL << bit) - 1; - - expires_limit = expires_limit & ~(mask); - - return expires_limit; -} - -/** - * mod_timer - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer() is a more efficient way to update the expire 
field of an - * active timer (if the timer is inactive it will be activated) - * - * mod_timer(timer, expires) is equivalent to: - * - * del_timer(timer); timer->expires = expires; add_timer(timer); - * - * Note that if there are multiple unserialized concurrent users of the - * same timer, then mod_timer() is the only safe way to modify the timeout, - * since add_timer() cannot modify an already running timer. - * - * The function returns whether it has modified a pending timer or not. - * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an - * active timer returns 1.) - */ -int mod_timer(struct timer_list *timer, unsigned long expires) -{ - expires = apply_slack(timer, expires); - - /* - * This is a common optimization triggered by the - * networking code - if the timer is re-modified - * to be the same thing then just return: - */ - if (timer_pending(timer) && timer->expires == expires) - return 1; - - return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); -} -EXPORT_SYMBOL(mod_timer); - -/** - * mod_timer_pinned - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer_pinned() is a way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * and to ensure that the timer is scheduled on the current CPU. - * - * Note that this does not prevent the timer from being migrated - * when the current CPU goes offline. If this is a problem for - * you, use CPU-hotplug notifiers to handle it correctly, for - * example, cancelling the timer when the corresponding CPU goes - * offline. 
- * - * mod_timer_pinned(timer, expires) is equivalent to: - * - * del_timer(timer); timer->expires = expires; add_timer(timer); - */ -int mod_timer_pinned(struct timer_list *timer, unsigned long expires) -{ - if (timer->expires == expires && timer_pending(timer)) - return 1; - - return __mod_timer(timer, expires, false, TIMER_PINNED); -} -EXPORT_SYMBOL(mod_timer_pinned); - -/** - * add_timer - start a timer - * @timer: the timer to be added - * - * The kernel will do a ->function(->data) callback from the - * timer interrupt at the ->expires point in the future. The - * current time is 'jiffies'. - * - * The timer's ->expires, ->function (and if the handler uses it, ->data) - * fields must be set prior calling this function. - * - * Timers with an ->expires field in the past will be executed in the next - * timer tick. - */ -void add_timer(struct timer_list *timer) -{ - BUG_ON(timer_pending(timer)); - mod_timer(timer, timer->expires); -} -EXPORT_SYMBOL(add_timer); - -/** - * add_timer_on - start a timer on a particular CPU - * @timer: the timer to be added - * @cpu: the CPU to start it on - * - * This is not very scalable on SMP. Double adds are not possible. - */ -void add_timer_on(struct timer_list *timer, int cpu) -{ - struct tvec_base *base = per_cpu(tvec_bases, cpu); - unsigned long flags; - - timer_stats_timer_set_start_info(timer); - BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->lock, flags); - timer_set_base(timer, base); - debug_activate(timer, timer->expires); - internal_add_timer(base, timer); - /* - * Check whether the other CPU is in dynticks mode and needs - * to be triggered to reevaluate the timer wheel. - * We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to stop its tick can not - * evaluate the timer wheel. - * - * Spare the IPI for deferrable timers on idle targets though. - * The next busy ticks will take care of it. 
Except full dynticks - * require special care against races with idle_cpu(), lets deal - * with that later. - */ - if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu)) - wake_up_nohz_cpu(cpu); - - spin_unlock_irqrestore(&base->lock, flags); -} -EXPORT_SYMBOL_GPL(add_timer_on); - -/** - * del_timer - deactive a timer. - * @timer: the timer to be deactivated - * - * del_timer() deactivates a timer - this works on both active and inactive - * timers. - * - * The function returns whether it has deactivated a pending timer or not. - * (ie. del_timer() of an inactive timer returns 0, del_timer() of an - * active timer returns 1.) - */ -int del_timer(struct timer_list *timer) -{ - struct tvec_base *base; - unsigned long flags; - int ret = 0; - - debug_assert_init(timer); - - timer_stats_timer_clear_start_info(timer); - if (timer_pending(timer)) { - base = lock_timer_base(timer, &flags); - ret = detach_if_pending(timer, base, true); - spin_unlock_irqrestore(&base->lock, flags); - } - - return ret; -} -EXPORT_SYMBOL(del_timer); - -/** - * try_to_del_timer_sync - Try to deactivate a timer - * @timer: timer do del - * - * This function tries to deactivate a timer. Upon successful (ret >= 0) - * exit the timer is not queued and the handler is not running on any CPU. - */ -int try_to_del_timer_sync(struct timer_list *timer) -{ - struct tvec_base *base; - unsigned long flags; - int ret = -1; - - debug_assert_init(timer); - - base = lock_timer_base(timer, &flags); - - if (base->running_timer != timer) { - timer_stats_timer_clear_start_info(timer); - ret = detach_if_pending(timer, base, true); - } - spin_unlock_irqrestore(&base->lock, flags); - - return ret; -} -EXPORT_SYMBOL(try_to_del_timer_sync); - -#ifdef CONFIG_SMP -/** - * del_timer_sync - deactivate a timer and wait for the handler to finish. 
- * @timer: the timer to be deactivated - * - * This function only differs from del_timer() on SMP: besides deactivating - * the timer it also makes sure the handler has finished executing on other - * CPUs. - * - * Synchronization rules: Callers must prevent restarting of the timer, - * otherwise this function is meaningless. It must not be called from - * interrupt contexts unless the timer is an irqsafe one. The caller must - * not hold locks which would prevent completion of the timer's - * handler. The timer's handler must not call add_timer_on(). Upon exit the - * timer is not queued and the handler is not running on any CPU. - * - * Note: For !irqsafe timers, you must not hold locks that are held in - * interrupt context while calling this function. Even if the lock has - * nothing to do with the timer in question. Here's why: - * - * CPU0 CPU1 - * ---- ---- - * - * call_timer_fn(); - * base->running_timer = mytimer; - * spin_lock_irq(somelock); - * - * spin_lock(somelock); - * del_timer_sync(mytimer); - * while (base->running_timer == mytimer); - * - * Now del_timer_sync() will never return and never release somelock. - * The interrupt on the other CPU is waiting to grab somelock but - * it has interrupted the softirq that CPU0 is waiting to finish. - * - * The function returns whether it has deactivated a pending timer or not. - */ -int del_timer_sync(struct timer_list *timer) -{ -#ifdef CONFIG_LOCKDEP - unsigned long flags; - - /* - * If lockdep gives a backtrace here, please reference - * the synchronization rules above. - */ - local_irq_save(flags); - lock_map_acquire(&timer->lockdep_map); - lock_map_release(&timer->lockdep_map); - local_irq_restore(flags); -#endif - /* - * don't use it in hardirq context, because it - * could lead to deadlock. 
- */ - WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); - for (;;) { - int ret = try_to_del_timer_sync(timer); - if (ret >= 0) - return ret; - cpu_relax(); - } -} -EXPORT_SYMBOL(del_timer_sync); -#endif - -static int cascade(struct tvec_base *base, struct tvec *tv, int index) -{ - /* cascade all the timers from tv up one level */ - struct timer_list *timer, *tmp; - struct list_head tv_list; - - list_replace_init(tv->vec + index, &tv_list); - - /* - * We are removing _all_ timers from the list, so we - * don't have to detach them individually. - */ - list_for_each_entry_safe(timer, tmp, &tv_list, entry) { - BUG_ON(tbase_get_base(timer->base) != base); - /* No accounting, while moving them */ - __internal_add_timer(base, timer); - } - - return index; -} - -static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), - unsigned long data) -{ - int count = preempt_count(); - -#ifdef CONFIG_LOCKDEP - /* - * It is permissible to free the timer from inside the - * function that is called from it, this we need to take into - * account for lockdep too. To avoid bogus "held lock freed" - * warnings as well as problems when looking into - * timer->lockdep_map, make a copy and use that here. - */ - struct lockdep_map lockdep_map; - - lockdep_copy_map(&lockdep_map, &timer->lockdep_map); -#endif - /* - * Couple the lock chain with the lock chain at - * del_timer_sync() by acquiring the lock_map around the fn() - * call here and in del_timer_sync(). - */ - lock_map_acquire(&lockdep_map); - - trace_timer_expire_entry(timer); - fn(data); - trace_timer_expire_exit(timer); - - lock_map_release(&lockdep_map); - - if (count != preempt_count()) { - WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", - fn, count, preempt_count()); - /* - * Restore the preempt count. That gives us a decent - * chance to survive and extract information. If the - * callback kept a lock held, bad luck, but not worse - * than the BUG() we had. 
- */ - preempt_count_set(count); - } -} - -#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) - -/** - * __run_timers - run all expired timers (if any) on this CPU. - * @base: the timer vector to be processed. - * - * This function cascades all vectors and executes all expired timer - * vectors. - */ -static inline void __run_timers(struct tvec_base *base) -{ - struct timer_list *timer; - - spin_lock_irq(&base->lock); - if (catchup_timer_jiffies(base)) { - spin_unlock_irq(&base->lock); - return; - } - while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list; - struct list_head *head = &work_list; - int index = base->timer_jiffies & TVR_MASK; - - /* - * Cascade timers: - */ - if (!index && - (!cascade(base, &base->tv2, INDEX(0))) && - (!cascade(base, &base->tv3, INDEX(1))) && - !cascade(base, &base->tv4, INDEX(2))) - cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; - list_replace_init(base->tv1.vec + index, head); - while (!list_empty(head)) { - void (*fn)(unsigned long); - unsigned long data; - bool irqsafe; - - timer = list_first_entry(head, struct timer_list,entry); - fn = timer->function; - data = timer->data; - irqsafe = tbase_get_irqsafe(timer->base); - - timer_stats_account_timer(timer); - - base->running_timer = timer; - detach_expired_timer(timer, base); - - if (irqsafe) { - spin_unlock(&base->lock); - call_timer_fn(timer, fn, data); - spin_lock(&base->lock); - } else { - spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn, data); - spin_lock_irq(&base->lock); - } - } - } - base->running_timer = NULL; - spin_unlock_irq(&base->lock); -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * Find out when the next timer event is due to happen. This - * is used on S/390 to stop all activity when a CPU is idle. - * This function needs to be called with interrupts disabled. 
- */ -static unsigned long __next_timer_interrupt(struct tvec_base *base) -{ - unsigned long timer_jiffies = base->timer_jiffies; - unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; - int index, slot, array, found = 0; - struct timer_list *nte; - struct tvec *varray[4]; - - /* Look for timer events in tv1. */ - index = slot = timer_jiffies & TVR_MASK; - do { - list_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) - continue; - - found = 1; - expires = nte->expires; - /* Look at the cascade bucket(s)? */ - if (!index || slot < index) - goto cascade; - return expires; - } - slot = (slot + 1) & TVR_MASK; - } while (slot != index); - -cascade: - /* Calculate the next cascade event */ - if (index) - timer_jiffies += TVR_SIZE - index; - timer_jiffies >>= TVR_BITS; - - /* Check tv2-tv5. */ - varray[0] = &base->tv2; - varray[1] = &base->tv3; - varray[2] = &base->tv4; - varray[3] = &base->tv5; - - for (array = 0; array < 4; array++) { - struct tvec *varp = varray[array]; - - index = slot = timer_jiffies & TVN_MASK; - do { - list_for_each_entry(nte, varp->vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) - continue; - - found = 1; - if (time_before(nte->expires, expires)) - expires = nte->expires; - } - /* - * Do we still search for the first timer or are - * we looking up the cascade buckets ? - */ - if (found) { - /* Look at the cascade bucket(s)? 
*/ - if (!index || slot < index) - break; - return expires; - } - slot = (slot + 1) & TVN_MASK; - } while (slot != index); - - if (index) - timer_jiffies += TVN_SIZE - index; - timer_jiffies >>= TVN_BITS; - } - return expires; -} - -/* - * Check, if the next hrtimer event is before the next timer wheel - * event: - */ -static unsigned long cmp_next_hrtimer_event(unsigned long now, - unsigned long expires) -{ - ktime_t hr_delta = hrtimer_get_next_event(); - struct timespec tsdelta; - unsigned long delta; - - if (hr_delta.tv64 == KTIME_MAX) - return expires; - - /* - * Expired timer available, let it expire in the next tick - */ - if (hr_delta.tv64 <= 0) - return now + 1; - - tsdelta = ktime_to_timespec(hr_delta); - delta = timespec_to_jiffies(&tsdelta); - - /* - * Limit the delta to the max value, which is checked in - * tick_nohz_stop_sched_tick(): - */ - if (delta > NEXT_TIMER_MAX_DELTA) - delta = NEXT_TIMER_MAX_DELTA; - - /* - * Take rounding errors in to account and make sure, that it - * expires in the next tick. Otherwise we go into an endless - * ping pong due to tick_nohz_stop_sched_tick() retriggering - * the timer softirq - */ - if (delta < 1) - delta = 1; - now += delta; - if (time_before(now, expires)) - return now; - return expires; -} - -/** - * get_next_timer_interrupt - return the jiffy of the next pending timer - * @now: current time (in jiffies) - */ -unsigned long get_next_timer_interrupt(unsigned long now) -{ - struct tvec_base *base = __this_cpu_read(tvec_bases); - unsigned long expires = now + NEXT_TIMER_MAX_DELTA; - - /* - * Pretend that there is no timer pending if the cpu is offline. - * Possible pending timers will be migrated later to an active cpu. 
- */ - if (cpu_is_offline(smp_processor_id())) - return expires; - - spin_lock(&base->lock); - if (base->active_timers) { - if (time_before_eq(base->next_timer, base->timer_jiffies)) - base->next_timer = __next_timer_interrupt(base); - expires = base->next_timer; - } - spin_unlock(&base->lock); - - if (time_before_eq(expires, now)) - return now; - - return cmp_next_hrtimer_event(now, expires); -} -#endif - -/* - * Called from the timer interrupt handler to charge one tick to the current - * process. user_tick is 1 if the tick is user time, 0 for system. - */ -void update_process_times(int user_tick) -{ - struct task_struct *p = current; - int cpu = smp_processor_id(); - - /* Note: this timer irq context must be accounted for as well. */ - account_process_tick(p, user_tick); - run_local_timers(); - rcu_check_callbacks(cpu, user_tick); -#ifdef CONFIG_IRQ_WORK - if (in_irq()) - irq_work_run(); -#endif - scheduler_tick(); - run_posix_cpu_timers(p); -} - -/* - * This function runs timers and the timer-tq in bottom half context. - */ -static void run_timer_softirq(struct softirq_action *h) -{ - struct tvec_base *base = __this_cpu_read(tvec_bases); - - hrtimer_run_pending(); - - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); -} - -/* - * Called by the local, per-CPU timer interrupt on SMP. - */ -void run_local_timers(void) -{ - hrtimer_run_queues(); - raise_softirq(TIMER_SOFTIRQ); -} - -#ifdef __ARCH_WANT_SYS_ALARM - -/* - * For backwards compatibility? This can be done in libc so Alpha - * and all newer ports shouldn't need it. - */ -SYSCALL_DEFINE1(alarm, unsigned int, seconds) -{ - return alarm_setitimer(seconds); -} - -#endif - -static void process_timeout(unsigned long __data) -{ - wake_up_process((struct task_struct *)__data); -} - -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have - * elapsed. 
The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. - * - * In all cases the return value is guaranteed to be non-negative. - */ -signed long __sched schedule_timeout(signed long timeout) -{ - struct timer_list timer; - unsigned long expire; - - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. 
- */ - if (timeout < 0) { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx\n", timeout); - dump_stack(); - current->state = TASK_RUNNING; - goto out; - } - } - - expire = timeout + jiffies; - - setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); - schedule(); - del_singleshot_timer_sync(&timer); - - /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer); - - timeout = expire - jiffies; - - out: - return timeout < 0 ? 0 : timeout; -} -EXPORT_SYMBOL(schedule_timeout); - -/* - * We can use __set_current_state() here because schedule_timeout() calls - * schedule() unconditionally. - */ -signed long __sched schedule_timeout_interruptible(signed long timeout) -{ - __set_current_state(TASK_INTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_interruptible); - -signed long __sched schedule_timeout_killable(signed long timeout) -{ - __set_current_state(TASK_KILLABLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_killable); - -signed long __sched schedule_timeout_uninterruptible(signed long timeout) -{ - __set_current_state(TASK_UNINTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_uninterruptible); - -static int init_timers_cpu(int cpu) -{ - int j; - struct tvec_base *base; - static char tvec_base_done[NR_CPUS]; - - if (!tvec_base_done[cpu]) { - static char boot_done; - - if (boot_done) { - /* - * The APs use this path later in boot - */ - base = kzalloc_node(sizeof(*base), GFP_KERNEL, - cpu_to_node(cpu)); - if (!base) - return -ENOMEM; - - /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ - if (WARN_ON(base != tbase_get_base(base))) { - kfree(base); - return -ENOMEM; - } - per_cpu(tvec_bases, cpu) = base; - } else { - /* - * This is for the boot CPU - we use compile-time - * static initialisation because per-cpu memory isn't - * ready yet and because the 
memory allocators are not - * initialised either. - */ - boot_done = 1; - base = &boot_tvec_bases; - } - spin_lock_init(&base->lock); - tvec_base_done[cpu] = 1; - } else { - base = per_cpu(tvec_bases, cpu); - } - - - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); - - base->timer_jiffies = jiffies; - base->next_timer = base->timer_jiffies; - base->active_timers = 0; - base->all_timers = 0; - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) -{ - struct timer_list *timer; - - while (!list_empty(head)) { - timer = list_first_entry(head, struct timer_list, entry); - /* We ignore the accounting on the dying cpu */ - detach_timer(timer, false); - timer_set_base(timer, new_base); - internal_add_timer(new_base, timer); - } -} - -static void migrate_timers(int cpu) -{ - struct tvec_base *old_base; - struct tvec_base *new_base; - int i; - - BUG_ON(cpu_online(cpu)); - old_base = per_cpu(tvec_bases, cpu); - new_base = get_cpu_var(tvec_bases); - /* - * The caller is globally serialized and nobody else - * takes two locks at once, deadlock is not possible. 
- */ - spin_lock_irq(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - - BUG_ON(old_base->running_timer); - - for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); - for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); - } - - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); - put_cpu_var(tvec_bases); -} -#endif /* CONFIG_HOTPLUG_CPU */ - -static int timer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - int err; - - switch(action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - err = init_timers_cpu(cpu); - if (err < 0) - return notifier_from_errno(err); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_timers(cpu); - break; -#endif - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block timers_nb = { - .notifier_call = timer_cpu_notify, -}; - - -void __init init_timers(void) -{ - int err; - - /* ensure there are enough low bits for flags in timer->base pointer */ - BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); - - err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - BUG_ON(err != NOTIFY_OK); - - init_timer_stats(); - register_cpu_notifier(&timers_nb); - open_softirq(TIMER_SOFTIRQ, run_timer_softirq); -} - -/** - * msleep - sleep safely even with waitqueue interruptions - * @msecs: Time in milliseconds to sleep for - */ -void msleep(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs) + 1; - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -} - -EXPORT_SYMBOL(msleep); - -/** - * msleep_interruptible - sleep waiting for signals - * @msecs: Time in 
milliseconds to sleep for - */ -unsigned long msleep_interruptible(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs) + 1; - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); - return jiffies_to_msecs(timeout); -} - -EXPORT_SYMBOL(msleep_interruptible); - -static int __sched do_usleep_range(unsigned long min, unsigned long max) -{ - ktime_t kmin; - unsigned long delta; - - kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (max - min) * NSEC_PER_USEC; - return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); -} - -/** - * usleep_range - Drop in replacement for udelay where wakeup is flexible - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep - */ -void usleep_range(unsigned long min, unsigned long max) -{ - __set_current_state(TASK_UNINTERRUPTIBLE); - do_usleep_range(min, max); -} -EXPORT_SYMBOL(usleep_range); -- cgit v1.1 From d6f93829811a3e74f58e3c3823d507411eed651a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Sun, 22 Jun 2014 01:29:13 +0200 Subject: timer: Store cpu-number in struct tvec_base Timers are serviced by the tick. But when a timer is enqueued on a dynticks target, we need to kick it in order to make it reconsider the next tick to schedule to correctly handle the timer's expiring time. Now while this kick is correctly performed for add_timer_on(), the mod_timer*() family has been a bit neglected. To prepare for fixing this, we need internal_add_timer() to be able to resolve the CPU target associated to a timer's object 'base' so that the kick can be centralized there. This can't be passed as an argument as not all the callers know the CPU number of a timer's base. So lets store it in the struct tvec_base to resolve the CPU without much overhead. It is set once for good at every CPU's first boot. 
Signed-off-by: Viresh Kumar Signed-off-by: Frederic Weisbecker Link: http://lkml.kernel.org/r/1403393357-2070-2-git-send-email-fweisbec@gmail.com Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3bb01a3..9e5f4f2 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -82,6 +82,7 @@ struct tvec_base { unsigned long next_timer; unsigned long active_timers; unsigned long all_timers; + int cpu; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -1568,6 +1569,7 @@ static int init_timers_cpu(int cpu) } spin_lock_init(&base->lock); tvec_base_done[cpu] = 1; + base->cpu = cpu; } else { base = per_cpu(tvec_bases, cpu); } -- cgit v1.1 From 9f6d9baaa8ca94b48aea495261cadaf2967c7784 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Sun, 22 Jun 2014 01:29:14 +0200 Subject: timer: Kick dynticks targets on mod_timer*() calls When a timer is enqueued or modified on a dynticks target, that CPU must re-evaluate the next tick to service that timer. The tick re-evaluation is performed by an IPI kick on the target. Now while we correctly call wake_up_nohz_cpu() from add_timer_on(), the mod_timer*() API family doesn't support so well dynticks targets. The reason for this is likely that __mod_timer() isn't supposed to select an idle target for a timer, unless that target is the current CPU, in which case a dynticks idle kick isn't actually needed. But there is a small race window lurking behind that assumption: the elected target has all the time to turn dynticks idle between the call to get_nohz_timer_target() and the locking of its base. Hence a risk that we enqueue a timer on a dynticks idle destination without kicking it. As a result, the timer might be serviced too late in the future. Also a target elected by __mod_timer() can be in full dynticks mode and thus require to be kicked as well. 
And unlike idle dynticks, this concern both local and remote targets. To fix this whole issue, lets centralize the dynticks kick to internal_add_timer() so that it is well handled for all sort of timer enqueue. Even timer migration is concerned so that a full dynticks target is correctly kicked as needed when timers are migrating to it. Signed-off-by: Viresh Kumar Signed-off-by: Frederic Weisbecker Link: http://lkml.kernel.org/r/1403393357-2070-3-git-send-email-fweisbec@gmail.com Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 9e5f4f2..aca5dfe 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -410,6 +410,22 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) base->next_timer = timer->expires; } base->all_timers++; + + /* + * Check whether the other CPU is in dynticks mode and needs + * to be triggered to reevaluate the timer wheel. + * We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to stop its tick can not + * evaluate the timer wheel. + * + * Spare the IPI for deferrable timers on idle targets though. + * The next busy ticks will take care of it. Except full dynticks + * require special care against races with idle_cpu(), lets deal + * with that later. + */ + if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu)) + wake_up_nohz_cpu(base->cpu); } #ifdef CONFIG_TIMER_STATS @@ -949,22 +965,6 @@ void add_timer_on(struct timer_list *timer, int cpu) timer_set_base(timer, base); debug_activate(timer, timer->expires); internal_add_timer(base, timer); - /* - * Check whether the other CPU is in dynticks mode and needs - * to be triggered to reevaluate the timer wheel. 
- * We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to stop its tick can not - * evaluate the timer wheel. - * - * Spare the IPI for deferrable timers on idle targets though. - * The next busy ticks will take care of it. Except full dynticks - * require special care against races with idle_cpu(), lets deal - * with that later. - */ - if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu)) - wake_up_nohz_cpu(cpu); - spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); -- cgit v1.1 From cddd02489f52ccf635ed65931214729a23b93cd6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Sun, 22 Jun 2014 01:29:15 +0200 Subject: hrtimer: Store cpu-number in struct hrtimer_cpu_base In lowres mode, hrtimers are serviced by the tick instead of a clock event. Now it works well as long as the tick stays periodic but we must also make sure that the hrtimers are serviced in dynticks mode. Part of that job consist in kicking a dynticks hrtimer target in order to make it reconsider the next tick to schedule to correctly handle the hrtimer's expiring time. And that part isn't handled by the hrtimers subsystem. To prepare for fixing this, we need __hrtimer_start_range_ns() to be able to resolve the CPU target associated to a hrtimer's object 'cpu_base' so that the kick can be centralized there. So lets store it in the 'struct hrtimer_cpu_base' to resolve the CPU without overhead. It is set once at CPU's online notification. 
Signed-off-by: Viresh Kumar Signed-off-by: Frederic Weisbecker Link: http://lkml.kernel.org/r/1403393357-2070-4-git-send-email-fweisbec@gmail.com Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3ab2899..0e32d4e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1680,6 +1680,7 @@ static void init_hrtimers_cpu(int cpu) timerqueue_init_head(&cpu_base->clock_base[i].active); } + cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); } -- cgit v1.1 From 49a2a07514a3a2ea4a02482fa60575e106d960f9 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 23 Jun 2014 13:39:37 +0530 Subject: hrtimer: Kick lowres dynticks targets on timer enqueue In lowres mode, hrtimers are serviced by the tick instead of a clock event. It works well as long as the tick stays periodic but we must also make sure that the hrtimers are serviced in dynticks mode targets, pretty much like timer list timers do. Note that all dynticks modes are concerned: get_nohz_timer_target() tries not to return remote idle CPUs but there is nothing to prevent the elected target from entering dynticks idle mode until we lock its base. It's also perfectly legal to enqueue hrtimers on a full dynticks CPU. So there are two requirements to correctly handle dynticks: 1) On target's tick stop time, we must not delay the next tick further than the next hrtimer. 2) On hrtimer queue time. If the tick of the target is stopped, we must wake up that CPU such that it sees the new hrtimer and recalculates the next tick accordingly. The point 1 is well handled currently through get_next_timer_interrupt() and cmp_next_hrtimer_event(). But the point 2 isn't handled at all. Fixing this is easy though as we have the necessary API ready for that. All we need is to call wake_up_nohz_cpu() on a target when a newly enqueued hrtimer requires tick rescheduling, like timer list timers do.
Signed-off-by: Viresh Kumar Signed-off-by: Frederic Weisbecker Link: http://lkml.kernel.org/r/3d7ea08ce008698e26bd39fe10f55949391073ab.1403507178.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0e32d4e..f900747 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1013,14 +1013,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, leftmost = enqueue_hrtimer(timer, new_base); - /* - * Only allow reprogramming if the new base is on this CPU. - * (it might still be on another CPU if the timer was pending) - * - * XXX send_remote_softirq() ? - */ - if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) - && hrtimer_enqueue_reprogram(timer, new_base)) { + if (!leftmost) { + unlock_hrtimer_base(timer, &flags); + return ret; + } + + if (!hrtimer_is_hres_active(timer)) { + /* + * Kick to reschedule the next tick to handle the new timer + * on dynticks target. + */ + wake_up_nohz_cpu(new_base->cpu_base->cpu); + } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && + hrtimer_enqueue_reprogram(timer, new_base)) { + /* + * Only allow reprogramming if the new base is on this CPU. + * (it might still be on another CPU if the timer was pending) + * + * XXX send_remote_softirq() ? + */ if (wakeup) { /* * We need to drop cpu_base->lock to avoid a -- cgit v1.1 From 9e1e01dd79ac4cf936623399abe57dfba4528ae6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Sun, 22 Jun 2014 01:29:17 +0200 Subject: hrtimer: Remove hrtimer_enqueue_reprogram() We call hrtimer_enqueue_reprogram() only when we are in high resolution mode now so we don't need to check that again in hrtimer_enqueue_reprogram(). Once the check is removed, hrtimer_enqueue_reprogram() turns to be an useless wrapper over hrtimer_reprogram() and can be dropped. 
Signed-off-by: Viresh Kumar Signed-off-by: Frederic Weisbecker Link: http://lkml.kernel.org/r/1403393357-2070-6-git-send-email-fweisbec@gmail.com Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f900747..66a6dc1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -602,6 +602,11 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * + * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming + * and no expiry check happens. The timer gets enqueued into the rbtree. The + * reprogramming and expiry check is done in the hrtimer_interrupt or in the + * softirq. + * * Called with interrupts disabled and base->cpu_base.lock held */ static int hrtimer_reprogram(struct hrtimer *timer, @@ -662,18 +667,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } -/* - * When High resolution timers are active, try to reprogram. Note, that in case - * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry - * check happens. The timer gets enqueued into the rbtree. The reprogramming - * and expiry check is done in the hrtimer_interrupt or in the softirq. 
- */ -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); -} - static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; @@ -755,8 +748,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) +static inline int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) { return 0; } @@ -1025,7 +1018,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, */ wake_up_nohz_cpu(new_base->cpu_base->cpu); } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && - hrtimer_enqueue_reprogram(timer, new_base)) { + hrtimer_reprogram(timer, new_base)) { /* * Only allow reprogramming if the new base is on this CPU. * (it might still be on another CPU if the timer was pending) -- cgit v1.1 From d93331965729850303f6111381c1a4a9e9b8ae5a Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Wed, 25 Jun 2014 14:44:53 -0700 Subject: ipv6: Allow accepting RA from local IP addresses. This can be used in virtual networking applications, and may have other uses as well. The option is disabled by default. A specific use case is setting up virtual routers, bridges, and hosts on a single OS without the use of network namespaces or virtual machines. With proper use of ip rules, routing tables, veth interface pairs and/or other virtual interfaces, and applications that can bind to interfaces and/or IP addresses, it is possible to create one or more virtual routers with multiple hosts attached.
The host interfaces can act as IPv6 systems, with radvd running on the ports in the virtual routers. With the option provided in this patch enabled, those hosts can now properly obtain IPv6 addresses from the radvd. Signed-off-by: Ben Greear Signed-off-by: David S. Miller --- kernel/sysctl_binary.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 653cbbd..e4ba9a5 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -522,6 +522,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = { { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" }, {} }; -- cgit v1.1 From 30fe6884021b9fa0124609e898a6341be188eb44 Mon Sep 17 00:00:00 2001 From: Sandeep Tripathy Date: Wed, 2 Jul 2014 15:00:58 +0530 Subject: cpuidle: move idle traces to cpuidle_enter_state() idle_exit event is the first event after a core exits idle state. So this should be traced before local irq is enabled. Likewise idle_entry is the last event before a core enters idle state. This will ease visualising the cpu idle state from kernel traces. Signed-off-by: Sandeep Tripathy Acked-by: Daniel Lezcano [rjw: Subject, rebase] Signed-off-by: Rafael J. Wysocki --- kernel/sched/idle.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cf009fb..658a58d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -147,8 +147,6 @@ use_default: clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) goto use_default; - trace_cpu_idle_rcuidle(next_state, dev->cpu); - /* * Enter the idle state previously returned by the governor decision. 
* This function will block until an interrupt occurs and will take @@ -156,8 +154,6 @@ use_default: */ entered_state = cpuidle_enter(drv, dev, next_state); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); - if (broadcast) clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); -- cgit v1.1 From d709f7bcbb3ab01704fa7b37a2e4b981cf3783c1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Jul 2014 23:37:54 +0200 Subject: PM / sleep / irq: Do not suspend wakeup interrupts If an IRQ has been configured for wakeup via enable_irq_wake(), the driver who has done that must be prepared for receiving interrupts after suspend_device_irqs() has returned, so there is no need to "suspend" such IRQs. Moreover, if drivers using enable_irq_wake() actually want to receive interrupts after suspend_device_irqs() has returned, they need to add IRQF_NO_SUSPEND to the IRQ flags while requesting the IRQs, which shouldn't be necessary (it also goes a bit too far, as IRQF_NO_SUSPEND causes the IRQ to be ignored by suspend_device_irqs() all the time regardless of whether or not it has been configured for signaling wakeup). For the above reasons, make __disable_irq() ignore IRQ descriptors with IRQD_WAKEUP_STATE set when its suspend argument is true which effectively causes them to behave like IRQs with IRQF_NO_SUSPEND set. This also allows IRQs configured for wakeup via enable_irq_wake() to work as wakeup interrupts for the "freeze" (suspend-to-idle) sleep mode automatically just like for any other sleep states. Signed-off-by: Rafael J. 
Wysocki Cc: Li Aubrey Cc: Dmitry Torokhov Cc: One Thousand Gnomes Link: http://lkml.kernel.org/r/4679574.kGUnqAuNl9@vostro.rjw.lan Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3dc6a61..88657d7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -385,7 +385,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) { if (suspend) { - if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) + if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND) || + irqd_has_set(&desc->irq_data, IRQD_WAKEUP_STATE)) return; desc->istate |= IRQS_SUSPENDED; } -- cgit v1.1 From a5152c8a125da3c5e16dc2208dd52e80f0803c5c Mon Sep 17 00:00:00 2001 From: Boris BREZILLON Date: Thu, 10 Jul 2014 19:14:16 +0200 Subject: genirq: generic chip: Export irq_map_generic_chip function Export the generic irq map function in order to provide irq_domain ops with generic mapping and specific of xlate function (needed by the new atmel AIC driver). 
Signed-off-by: Boris BREZILLON Acked-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1405012462-766-2-git-send-email-boris.brezillon@free-electrons.com Signed-off-by: Jason Cooper --- kernel/irq/generic-chip.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 452d6f2..cf80e7b 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -341,8 +341,8 @@ static struct lock_class_key irq_nested_lock_class; /* * irq_map_generic_chip - Map a generic chip for an irq domain */ -static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, - irq_hw_number_t hw_irq) +int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, + irq_hw_number_t hw_irq) { struct irq_data *data = irq_get_irq_data(virq); struct irq_domain_chip_generic *dgc = d->gc; @@ -394,6 +394,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); return 0; } +EXPORT_SYMBOL_GPL(irq_map_generic_chip); struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, -- cgit v1.1 From 32c4741cb66703a3c282f41d77deff4afd93342a Mon Sep 17 00:00:00 2001 From: Dmitry Kasatkin Date: Tue, 17 Jun 2014 11:56:59 +0300 Subject: KEYS: validate certificate trust only with builtin keys Instead of allowing public keys, with certificates signed by any key on the system trusted keyring, to be added to a trusted keyring, this patch further restricts the certificates to those signed only by builtin keys on the system keyring. This patch defines a new option 'builtin' for the kernel parameter 'keys_ownerid' to allow trust validation using builtin keys. 
Simplified Mimi's "KEYS: define an owner trusted keyring" patch Changelog v7: - rename builtin_keys to use_builtin_keys Signed-off-by: Dmitry Kasatkin Signed-off-by: Mimi Zohar --- kernel/system_keyring.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c index 52ebc70..875f64e 100644 --- a/kernel/system_keyring.c +++ b/kernel/system_keyring.c @@ -89,6 +89,7 @@ static __init int load_system_certificate_list(void) pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", PTR_ERR(key)); } else { + set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags); pr_notice("Loaded X.509 cert '%s'\n", key_ref_to_ptr(key)->description); key_ref_put(key); -- cgit v1.1 From b4210b810e5040f10a30ba56de6c3faab5c49345 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 25 Jun 2014 15:27:37 +0200 Subject: Add module param type 'ullong' Some driver might want to pass in an 64-bit value, so introduce a module param type 'ullong'. Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Ewan Milne Acked-by: Rusty Russell Signed-off-by: Christoph Hellwig --- kernel/params.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 1e52ca2..34f5270 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -256,6 +256,7 @@ STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); STANDARD_PARAM_DEF(long, long, "%li", kstrtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); int param_set_charp(const char *val, const struct kernel_param *kp) { -- cgit v1.1 From d78ab02c2c194257a03355fbb79eb721b381d105 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2014 15:02:11 -0700 Subject: seccomp: create internal mode-setting function In preparation for having other callers of the seccomp mode setting logic, split the prctl 
entry point away from the core logic that performs seccomp mode setting. Signed-off-by: Kees Cook Reviewed-by: Oleg Nesterov Reviewed-by: Andy Lutomirski --- kernel/seccomp.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 301bbc2..afb916c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -473,7 +473,7 @@ long prctl_get_seccomp(void) } /** - * prctl_set_seccomp: configures current->seccomp.mode + * seccomp_set_mode: internal function for setting seccomp mode * @seccomp_mode: requested mode to use * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER * @@ -486,7 +486,7 @@ long prctl_get_seccomp(void) * * Returns 0 on success or -EINVAL on failure. */ -long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter) { long ret = -EINVAL; @@ -517,3 +517,15 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) out: return ret; } + +/** + * prctl_set_seccomp: configures current->seccomp.mode + * @seccomp_mode: requested mode to use + * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER + * + * Returns 0 on success or -EINVAL on failure. + */ +long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +{ + return seccomp_set_mode(seccomp_mode, filter); +} -- cgit v1.1 From 1f41b450416e689b9b7c8bfb750a98604f687a9b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Jun 2014 15:38:02 -0700 Subject: seccomp: extract check/assign mode helpers To support splitting mode 1 from mode 2, extract the mode checking and assignment logic into common functions. 
Signed-off-by: Kees Cook Reviewed-by: Oleg Nesterov Reviewed-by: Andy Lutomirski --- kernel/seccomp.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index afb916c..9df7def 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -194,7 +194,23 @@ static u32 seccomp_run_filters(int syscall) } return ret; } +#endif /* CONFIG_SECCOMP_FILTER */ +static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) +{ + if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) + return false; + + return true; +} + +static inline void seccomp_assign_mode(unsigned long seccomp_mode) +{ + current->seccomp.mode = seccomp_mode; + set_tsk_thread_flag(current, TIF_SECCOMP); +} + +#ifdef CONFIG_SECCOMP_FILTER /** * seccomp_attach_filter: Attaches a seccomp filter to current. * @fprog: BPF program to install @@ -490,8 +506,7 @@ static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter) { long ret = -EINVAL; - if (current->seccomp.mode && - current->seccomp.mode != seccomp_mode) + if (!seccomp_may_assign_mode(seccomp_mode)) goto out; switch (seccomp_mode) { @@ -512,8 +527,7 @@ static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter) goto out; } - current->seccomp.mode = seccomp_mode; - set_thread_flag(TIF_SECCOMP); + seccomp_assign_mode(seccomp_mode); out: return ret; } -- cgit v1.1 From 3b23dd12846215eff4afb073366b80c0c4d7543e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Jun 2014 15:55:25 -0700 Subject: seccomp: split mode setting routines Separates the two mode setting paths to make things more readable with fewer #ifdefs within function bodies. 
Signed-off-by: Kees Cook Reviewed-by: Oleg Nesterov Reviewed-by: Andy Lutomirski --- kernel/seccomp.c | 71 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 9df7def..05cac2c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -489,48 +489,66 @@ long prctl_get_seccomp(void) } /** - * seccomp_set_mode: internal function for setting seccomp mode - * @seccomp_mode: requested mode to use - * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER - * - * This function may be called repeatedly with a @seccomp_mode of - * SECCOMP_MODE_FILTER to install additional filters. Every filter - * successfully installed will be evaluated (in reverse order) for each system - * call the task makes. + * seccomp_set_mode_strict: internal function for setting strict seccomp * * Once current->seccomp.mode is non-zero, it may not be changed. * * Returns 0 on success or -EINVAL on failure. */ -static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter) +static long seccomp_set_mode_strict(void) { + const unsigned long seccomp_mode = SECCOMP_MODE_STRICT; long ret = -EINVAL; if (!seccomp_may_assign_mode(seccomp_mode)) goto out; - switch (seccomp_mode) { - case SECCOMP_MODE_STRICT: - ret = 0; #ifdef TIF_NOTSC - disable_TSC(); + disable_TSC(); #endif - break; + seccomp_assign_mode(seccomp_mode); + ret = 0; + +out: + + return ret; +} + #ifdef CONFIG_SECCOMP_FILTER - case SECCOMP_MODE_FILTER: - ret = seccomp_attach_user_filter(filter); - if (ret) - goto out; - break; -#endif - default: +/** + * seccomp_set_mode_filter: internal function for setting seccomp filter + * @filter: struct sock_fprog containing filter + * + * This function may be called repeatedly to install additional filters. + * Every filter successfully installed will be evaluated (in reverse order) + * for each system call the task makes. 
+ * + * Once current->seccomp.mode is non-zero, it may not be changed. + * + * Returns 0 on success or -EINVAL on failure. + */ +static long seccomp_set_mode_filter(char __user *filter) +{ + const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; + long ret = -EINVAL; + + if (!seccomp_may_assign_mode(seccomp_mode)) + goto out; + + ret = seccomp_attach_user_filter(filter); + if (ret) goto out; - } seccomp_assign_mode(seccomp_mode); out: return ret; } +#else +static inline long seccomp_set_mode_filter(char __user *filter) +{ + return -EINVAL; +} +#endif /** * prctl_set_seccomp: configures current->seccomp.mode @@ -541,5 +559,12 @@ out: */ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) { - return seccomp_set_mode(seccomp_mode, filter); + switch (seccomp_mode) { + case SECCOMP_MODE_STRICT: + return seccomp_set_mode_strict(); + case SECCOMP_MODE_FILTER: + return seccomp_set_mode_filter(filter); + default: + return -EINVAL; + } } -- cgit v1.1 From 48dc92b9fc3926844257316e75ba11eb5c742b2c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Jun 2014 16:08:24 -0700 Subject: seccomp: add "seccomp" syscall This adds the new "seccomp" syscall with both an "operation" and "flags" parameter for future expansion. The third argument is a pointer value, used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...). In addition to the TSYNC flag later in this patch series, there is a non-zero chance that this syscall could be used for configuring a fixed argument area for seccomp-tracer-aware processes to pass syscall arguments in the future. Hence, the use of "seccomp" not simply "seccomp_add_filter" for this syscall. Additionally, this syscall uses operation, flags, and user pointer for arguments because strictly passing arguments via a user pointer would mean seccomp itself would be unable to trivially filter the seccomp syscall itself. 
Signed-off-by: Kees Cook Reviewed-by: Oleg Nesterov Reviewed-by: Andy Lutomirski --- kernel/seccomp.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++----- kernel/sys_ni.c | 3 +++ 2 files changed, 53 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 05cac2c..f065257 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -18,6 +18,7 @@ #include #include #include +#include /* #define SECCOMP_DEBUG 1 */ @@ -314,7 +315,7 @@ free_prog: * * Returns 0 on success and non-zero otherwise. */ -static long seccomp_attach_user_filter(char __user *user_filter) +static long seccomp_attach_user_filter(const char __user *user_filter) { struct sock_fprog fprog; long ret = -EFAULT; @@ -517,6 +518,7 @@ out: #ifdef CONFIG_SECCOMP_FILTER /** * seccomp_set_mode_filter: internal function for setting seccomp filter + * @flags: flags to change filter behavior * @filter: struct sock_fprog containing filter * * This function may be called repeatedly to install additional filters. @@ -527,11 +529,16 @@ out: * * Returns 0 on success or -EINVAL on failure. */ -static long seccomp_set_mode_filter(char __user *filter) +static long seccomp_set_mode_filter(unsigned int flags, + const char __user *filter) { const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; long ret = -EINVAL; + /* Validate flags. */ + if (flags != 0) + goto out; + if (!seccomp_may_assign_mode(seccomp_mode)) goto out; @@ -544,12 +551,35 @@ out: return ret; } #else -static inline long seccomp_set_mode_filter(char __user *filter) +static inline long seccomp_set_mode_filter(unsigned int flags, + const char __user *filter) { return -EINVAL; } #endif +/* Common entry point for both prctl and syscall. 
*/ +static long do_seccomp(unsigned int op, unsigned int flags, + const char __user *uargs) +{ + switch (op) { + case SECCOMP_SET_MODE_STRICT: + if (flags != 0 || uargs != NULL) + return -EINVAL; + return seccomp_set_mode_strict(); + case SECCOMP_SET_MODE_FILTER: + return seccomp_set_mode_filter(flags, uargs); + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags, + const char __user *, uargs) +{ + return do_seccomp(op, flags, uargs); +} + /** * prctl_set_seccomp: configures current->seccomp.mode * @seccomp_mode: requested mode to use @@ -559,12 +589,27 @@ static inline long seccomp_set_mode_filter(char __user *filter) */ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) { + unsigned int op; + char __user *uargs; + switch (seccomp_mode) { case SECCOMP_MODE_STRICT: - return seccomp_set_mode_strict(); + op = SECCOMP_SET_MODE_STRICT; + /* + * Setting strict mode through prctl always ignored filter, + * so make sure it is always NULL here to pass the internal + * check in do_seccomp(). + */ + uargs = NULL; + break; case SECCOMP_MODE_FILTER: - return seccomp_set_mode_filter(filter); + op = SECCOMP_SET_MODE_FILTER; + uargs = filter; + break; default: return -EINVAL; } + + /* prctl interface doesn't have flags, so they are always zero. 
*/ + return do_seccomp(op, 0, uargs); } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 36441b5..2904a21 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -213,3 +213,6 @@ cond_syscall(compat_sys_open_by_handle_at); /* compare kernel pointers */ cond_syscall(sys_kcmp); + +/* operate on Secure Computing state */ +cond_syscall(sys_seccomp); -- cgit v1.1 From 1d4457f99928a968767f6405b4a1f50845aa15fd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2014 15:23:46 -0700 Subject: sched: move no_new_privs into new atomic flags Since seccomp transitions between threads requires updates to the no_new_privs flag to be atomic, the flag must be part of an atomic flag set. This moves the nnp flag into a separate task field, and introduces accessors. Signed-off-by: Kees Cook Reviewed-by: Oleg Nesterov Reviewed-by: Andy Lutomirski --- kernel/seccomp.c | 2 +- kernel/sys.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f065257..d259613 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -241,7 +241,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) * This avoids scenarios where unprivileged tasks can affect the * behavior of privileged children. */ - if (!current->no_new_privs && + if (!task_no_new_privs(current) && security_capable_noaudit(current_cred(), current_user_ns(), CAP_SYS_ADMIN) != 0) return -EACCES; diff --git a/kernel/sys.c b/kernel/sys.c index 66a751e..ce81291 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1990,12 +1990,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; - current->no_new_privs = 1; + task_set_no_new_privs(current); break; case PR_GET_NO_NEW_PRIVS: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - return current->no_new_privs ? 1 : 0; + return task_no_new_privs(current) ? 
1 : 0; case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; -- cgit v1.1 From c8bee430dc52cfca6c1aab27752a89275d78d50f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 27 Jun 2014 15:16:33 -0700 Subject: seccomp: split filter prep from check and apply In preparation for adding seccomp locking, move filter creation away from where it is checked and applied. This will allow for locking where no memory allocation is happening. The validation, filter attachment, and seccomp mode setting can all happen under the future locks. For extreme defensiveness, I've added a BUG_ON check for the calculated size of the buffer allocation in case BPF_MAXINSN ever changes, which shouldn't ever happen. The compiler should actually optimize out this check since the test above it makes it impossible. Signed-off-by: Kees Cook Reviewed-by: Oleg Nesterov Reviewed-by: Andy Lutomirski --- kernel/seccomp.c | 97 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d259613..5812516 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* #define SECCOMP_DEBUG 1 */ @@ -27,7 +28,6 @@ #include #include #include -#include #include #include @@ -213,27 +213,23 @@ static inline void seccomp_assign_mode(unsigned long seccomp_mode) #ifdef CONFIG_SECCOMP_FILTER /** - * seccomp_attach_filter: Attaches a seccomp filter to current. + * seccomp_prepare_filter: Prepares a seccomp filter for use. * @fprog: BPF program to install * - * Returns 0 on success or an errno on failure. + * Returns filter on success or an ERR_PTR on failure. 
*/ -static long seccomp_attach_filter(struct sock_fprog *fprog) +static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *filter; - unsigned long fp_size = fprog->len * sizeof(struct sock_filter); - unsigned long total_insns = fprog->len; + unsigned long fp_size; struct sock_filter *fp; int new_len; long ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) - return -EINVAL; - - for (filter = current->seccomp.filter; filter; filter = filter->prev) - total_insns += filter->prog->len + 4; /* include a 4 instr penalty */ - if (total_insns > MAX_INSNS_PER_PATH) - return -ENOMEM; + return ERR_PTR(-EINVAL); + BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); + fp_size = fprog->len * sizeof(struct sock_filter); /* * Installing a seccomp filter requires that the task has @@ -244,11 +240,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) if (!task_no_new_privs(current) && security_capable_noaudit(current_cred(), current_user_ns(), CAP_SYS_ADMIN) != 0) - return -EACCES; + return ERR_PTR(-EACCES); fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); if (!fp) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* Copy the instructions from fprog. */ ret = -EFAULT; @@ -292,13 +288,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) sk_filter_select_runtime(filter->prog); - /* - * If there is an existing filter, make it the prev and don't drop its - * task reference. - */ - filter->prev = current->seccomp.filter; - current->seccomp.filter = filter; - return 0; + return filter; free_filter_prog: kfree(filter->prog); @@ -306,19 +296,20 @@ free_filter: kfree(filter); free_prog: kfree(fp); - return ret; + return ERR_PTR(ret); } /** - * seccomp_attach_user_filter - attaches a user-supplied sock_fprog + * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog * @user_filter: pointer to the user data containing a sock_fprog. * * Returns 0 on success and non-zero otherwise. 
*/ -static long seccomp_attach_user_filter(const char __user *user_filter) +static struct seccomp_filter * +seccomp_prepare_user_filter(const char __user *user_filter) { struct sock_fprog fprog; - long ret = -EFAULT; + struct seccomp_filter *filter = ERR_PTR(-EFAULT); #ifdef CONFIG_COMPAT if (is_compat_task()) { @@ -331,9 +322,39 @@ static long seccomp_attach_user_filter(const char __user *user_filter) #endif if (copy_from_user(&fprog, user_filter, sizeof(fprog))) goto out; - ret = seccomp_attach_filter(&fprog); + filter = seccomp_prepare_filter(&fprog); out: - return ret; + return filter; +} + +/** + * seccomp_attach_filter: validate and attach filter + * @flags: flags to change filter behavior + * @filter: seccomp filter to add to the current process + * + * Returns 0 on success, -ve on error. + */ +static long seccomp_attach_filter(unsigned int flags, + struct seccomp_filter *filter) +{ + unsigned long total_insns; + struct seccomp_filter *walker; + + /* Validate resulting filter length. */ + total_insns = filter->prog->len; + for (walker = current->seccomp.filter; walker; walker = walker->prev) + total_insns += walker->prog->len + 4; /* 4 instr penalty */ + if (total_insns > MAX_INSNS_PER_PATH) + return -ENOMEM; + + /* + * If there is an existing filter, make it the prev and don't drop its + * task reference. 
+ */ + filter->prev = current->seccomp.filter; + current->seccomp.filter = filter; + + return 0; } /* get_seccomp_filter - increments the reference count of the filter on @tsk */ @@ -346,6 +367,14 @@ void get_seccomp_filter(struct task_struct *tsk) atomic_inc(&orig->usage); } +static inline void seccomp_filter_free(struct seccomp_filter *filter) +{ + if (filter) { + sk_filter_free(filter->prog); + kfree(filter); + } +} + /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ void put_seccomp_filter(struct task_struct *tsk) { @@ -354,8 +383,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; - sk_filter_free(freeme->prog); - kfree(freeme); + seccomp_filter_free(freeme); } } @@ -533,21 +561,30 @@ static long seccomp_set_mode_filter(unsigned int flags, const char __user *filter) { const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; + struct seccomp_filter *prepared = NULL; long ret = -EINVAL; /* Validate flags. */ if (flags != 0) goto out; + /* Prepare the new filter before holding any locks. */ + prepared = seccomp_prepare_user_filter(filter); + if (IS_ERR(prepared)) + return PTR_ERR(prepared); + if (!seccomp_may_assign_mode(seccomp_mode)) goto out; - ret = seccomp_attach_user_filter(filter); + ret = seccomp_attach_filter(flags, prepared); if (ret) goto out; + /* Do not free the successfully attached filter. */ + prepared = NULL; seccomp_assign_mode(seccomp_mode); out: + seccomp_filter_free(prepared); return ret; } #else -- cgit v1.1 From dbd952127d11bb44a4ea30b08cc60531b6a23d71 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 27 Jun 2014 15:18:48 -0700 Subject: seccomp: introduce writer locking Normally, task_struct.seccomp.filter is only ever read or modified by the task that owns it (current). This property aids in fast access during system call filtering as read access is lockless. 
Updating the pointer from another task, however, opens up race conditions. To allow cross-thread filter pointer updates, writes to the seccomp fields are now protected by the sighand spinlock (which is shared by all threads in the thread group). Read access remains lockless because pointer updates themselves are atomic. However, writes (or cloning) often entail additional checking (like maximum instruction counts) which require locking to perform safely. In the case of cloning threads, the child is invisible to the system until it enters the task list. To make sure a child can't be cloned from a thread and left in a prior state, seccomp duplication is additionally moved under the sighand lock. Then parent and child are certain to have the same seccomp state when they exit the lock. Based on patches by Will Drewry and David Drysdale. Signed-off-by: Kees Cook Reviewed-by: Oleg Nesterov Reviewed-by: Andy Lutomirski --- kernel/fork.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- kernel/seccomp.c | 16 +++++++++++++++- 2 files changed, 63 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 6a13c46..ed4bc33 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -315,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto free_ti; tsk->stack = ti; +#ifdef CONFIG_SECCOMP + /* + * We must handle setting up seccomp filters once we're under + * the sighand lock in case orig has changed between now and + * then. Until then, filter must be NULL to avoid messing up + * the usage counts on the error path calling free_task. 
+ */ + tsk->seccomp.filter = NULL; +#endif setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); @@ -1081,6 +1090,39 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return 0; } +static void copy_seccomp(struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + /* + * Must be called with sighand->lock held, which is common to + * all threads in the group. Holding cred_guard_mutex is not + * needed because this new task is not yet running and cannot + * be racing exec. + */ + BUG_ON(!spin_is_locked(&current->sighand->siglock)); + + /* Ref-count the new filter user, and assign it. */ + get_seccomp_filter(current); + p->seccomp = current->seccomp; + + /* + * Explicitly enable no_new_privs here in case it got set + * between the task_struct being duplicated and holding the + * sighand lock. The seccomp state and nnp must be in sync. + */ + if (task_no_new_privs(current)) + task_set_no_new_privs(p); + + /* + * If the parent gained a seccomp mode after copying thread + * flags and between before we held the sighand lock, we have + * to manually enable the seccomp thread flag here. + */ + if (p->seccomp.mode != SECCOMP_MODE_DISABLED) + set_tsk_thread_flag(p, TIF_SECCOMP); +#endif +} + SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@ -1196,7 +1238,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; ftrace_graph_init_task(p); - get_seccomp_filter(p); rt_mutex_init_task(p); @@ -1437,6 +1478,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_lock(&current->sighand->siglock); /* + * Copy seccomp details explicitly here, in case they were changed + * before holding sighand lock. + */ + copy_seccomp(p); + + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. 
Restart if a signal comes in before we add the new process to diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5812516..d5543e7 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -199,6 +199,8 @@ static u32 seccomp_run_filters(int syscall) static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) { + BUG_ON(!spin_is_locked(&current->sighand->siglock)); + if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) return false; @@ -207,6 +209,8 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) static inline void seccomp_assign_mode(unsigned long seccomp_mode) { + BUG_ON(!spin_is_locked(&current->sighand->siglock)); + current->seccomp.mode = seccomp_mode; set_tsk_thread_flag(current, TIF_SECCOMP); } @@ -332,6 +336,8 @@ out: * @flags: flags to change filter behavior * @filter: seccomp filter to add to the current process * + * Caller must be holding current->sighand->siglock lock. + * * Returns 0 on success, -ve on error. */ static long seccomp_attach_filter(unsigned int flags, @@ -340,6 +346,8 @@ static long seccomp_attach_filter(unsigned int flags, unsigned long total_insns; struct seccomp_filter *walker; + BUG_ON(!spin_is_locked(&current->sighand->siglock)); + /* Validate resulting filter length. */ total_insns = filter->prog->len; for (walker = current->seccomp.filter; walker; walker = walker->prev) @@ -529,6 +537,8 @@ static long seccomp_set_mode_strict(void) const unsigned long seccomp_mode = SECCOMP_MODE_STRICT; long ret = -EINVAL; + spin_lock_irq(&current->sighand->siglock); + if (!seccomp_may_assign_mode(seccomp_mode)) goto out; @@ -539,6 +549,7 @@ static long seccomp_set_mode_strict(void) ret = 0; out: + spin_unlock_irq(&current->sighand->siglock); return ret; } @@ -566,13 +577,15 @@ static long seccomp_set_mode_filter(unsigned int flags, /* Validate flags. */ if (flags != 0) - goto out; + return -EINVAL; /* Prepare the new filter before holding any locks. 
*/ prepared = seccomp_prepare_user_filter(filter); if (IS_ERR(prepared)) return PTR_ERR(prepared); + spin_lock_irq(&current->sighand->siglock); + if (!seccomp_may_assign_mode(seccomp_mode)) goto out; @@ -584,6 +597,7 @@ static long seccomp_set_mode_filter(unsigned int flags, seccomp_assign_mode(seccomp_mode); out: + spin_unlock_irq(&current->sighand->siglock); seccomp_filter_free(prepared); return ret; } -- cgit v1.1 From 3ba2530cc06eb4aee4f1f754f43d781e8a12ee09 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 27 Jun 2014 15:01:35 -0700 Subject: seccomp: allow mode setting across threads This changes the mode setting helper to allow threads to change the seccomp mode from another thread. We must maintain barriers to keep TIF_SECCOMP synchronized with the rest of the seccomp state. Signed-off-by: Kees Cook Reviewed-by: Oleg Nesterov Reviewed-by: Andy Lutomirski --- kernel/seccomp.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d5543e7..9065d2c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -173,21 +173,24 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) */ static u32 seccomp_run_filters(int syscall) { - struct seccomp_filter *f; + struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); struct seccomp_data sd; u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. */ - if (WARN_ON(current->seccomp.filter == NULL)) + if (unlikely(WARN_ON(f == NULL))) return SECCOMP_RET_KILL; + /* Make sure cross-thread synced filter points somewhere sane. */ + smp_read_barrier_depends(); + populate_seccomp_data(&sd); /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA).
*/ - for (f = current->seccomp.filter; f; f = f->prev) { + for (; f; f = f->prev) { u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) @@ -207,12 +210,18 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) return true; } -static inline void seccomp_assign_mode(unsigned long seccomp_mode) +static inline void seccomp_assign_mode(struct task_struct *task, + unsigned long seccomp_mode) { - BUG_ON(!spin_is_locked(&current->sighand->siglock)); + BUG_ON(!spin_is_locked(&task->sighand->siglock)); - current->seccomp.mode = seccomp_mode; - set_tsk_thread_flag(current, TIF_SECCOMP); + task->seccomp.mode = seccomp_mode; + /* + * Make sure TIF_SECCOMP cannot be set before the mode (and + * filter) is set. + */ + smp_mb__before_atomic(); + set_tsk_thread_flag(task, TIF_SECCOMP); } #ifdef CONFIG_SECCOMP_FILTER @@ -435,12 +444,17 @@ static int mode1_syscalls_32[] = { int __secure_computing(int this_syscall) { - int mode = current->seccomp.mode; int exit_sig = 0; int *syscall; u32 ret; - switch (mode) { + /* + * Make sure that any changes to mode from another thread have + * been seen after TIF_SECCOMP was seen. + */ + rmb(); + + switch (current->seccomp.mode) { case SECCOMP_MODE_STRICT: syscall = mode1_syscalls; #ifdef CONFIG_COMPAT @@ -545,7 +559,7 @@ static long seccomp_set_mode_strict(void) #ifdef TIF_NOTSC disable_TSC(); #endif - seccomp_assign_mode(seccomp_mode); + seccomp_assign_mode(current, seccomp_mode); ret = 0; out: @@ -595,7 +609,7 @@ static long seccomp_set_mode_filter(unsigned int flags, /* Do not free the successfully attached filter.
*/ prepared = NULL; - seccomp_assign_mode(seccomp_mode); + seccomp_assign_mode(current, seccomp_mode); out: spin_unlock_irq(&current->sighand->siglock); seccomp_filter_free(prepared); -- cgit v1.1 From c2e1f2e30daa551db3c670c0ccfeab20a540b9e1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 5 Jun 2014 00:23:17 -0700 Subject: seccomp: implement SECCOMP_FILTER_FLAG_TSYNC Applying restrictive seccomp filter programs to large or diverse codebases often requires handling threads which may be started early in the process lifetime (e.g., by code that is linked in). While it is possible to apply permissive programs prior to process start up, it is difficult to further restrict the kernel ABI to those threads after that point. This change adds a new seccomp syscall flag to SECCOMP_SET_MODE_FILTER for synchronizing thread group seccomp filters at filter installation time. When calling seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, filter) an attempt will be made to synchronize all threads in current's threadgroup to its new seccomp filter program. This is possible iff all threads are using a filter that is an ancestor to the filter current is attempting to synchronize to. NULL filters (where the task is running as SECCOMP_MODE_NONE) are also treated as ancestors allowing threads to be transitioned into SECCOMP_MODE_FILTER. If prctl(PR_SET_NO_NEW_PRIVS, ...) has been set on the calling thread, no_new_privs will be set for all synchronized threads too. On success, 0 is returned. On failure, the pid of one of the failing threads will be returned and no filters will have been applied.
The race conditions against another thread are: - requesting TSYNC (already handled by sighand lock) - performing a clone (already handled by sighand lock) - changing its filter (already handled by sighand lock) - calling exec (handled by cred_guard_mutex) The clone case is assisted by the fact that new threads will have their seccomp state duplicated from their parent before appearing on the tasklist. Holding cred_guard_mutex means that seccomp filters cannot be assigned while in the middle of another thread's exec (potentially bypassing no_new_privs or similar). The call to de_thread() may kill threads waiting for the mutex. Changes across threads to the filter pointer includes a barrier. Based on patches by Will Drewry. Suggested-by: Julien Tinnes Signed-off-by: Kees Cook Reviewed-by: Oleg Nesterov Reviewed-by: Andy Lutomirski --- kernel/seccomp.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 134 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 9065d2c..74f4601 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -26,6 +26,7 @@ #ifdef CONFIG_SECCOMP_FILTER #include #include +#include #include #include #include @@ -225,6 +226,114 @@ static inline void seccomp_assign_mode(struct task_struct *task, } #ifdef CONFIG_SECCOMP_FILTER +/* Returns 1 if the parent is an ancestor of the child. */ +static int is_ancestor(struct seccomp_filter *parent, + struct seccomp_filter *child) +{ + /* NULL is the root ancestor. */ + if (parent == NULL) + return 1; + for (; child; child = child->prev) + if (child == parent) + return 1; + return 0; +} + +/** + * seccomp_can_sync_threads: checks if all threads can be synchronized + * + * Expects sighand and cred_guard_mutex locks to be held. + * + * Returns 0 on success, -ve on error, or the pid of a thread which was + * either not in the correct seccomp mode or it did not have an ancestral + * seccomp filter. 
+ */ +static inline pid_t seccomp_can_sync_threads(void) +{ + struct task_struct *thread, *caller; + + BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); + BUG_ON(!spin_is_locked(&current->sighand->siglock)); + + /* Validate all threads being eligible for synchronization. */ + caller = current; + for_each_thread(caller, thread) { + pid_t failed; + + /* Skip current, since it is initiating the sync. */ + if (thread == caller) + continue; + + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED || + (thread->seccomp.mode == SECCOMP_MODE_FILTER && + is_ancestor(thread->seccomp.filter, + caller->seccomp.filter))) + continue; + + /* Return the first thread that cannot be synchronized. */ + failed = task_pid_vnr(thread); + /* If the pid cannot be resolved, then return -ESRCH */ + if (unlikely(WARN_ON(failed == 0))) + failed = -ESRCH; + return failed; + } + + return 0; +} + +/** + * seccomp_sync_threads: sets all threads to use current's filter + * + * Expects sighand and cred_guard_mutex locks to be held, and for + * seccomp_can_sync_threads() to have returned success already + * without dropping the locks. + * + */ +static inline void seccomp_sync_threads(void) +{ + struct task_struct *thread, *caller; + + BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); + BUG_ON(!spin_is_locked(&current->sighand->siglock)); + + /* Synchronize all threads. */ + caller = current; + for_each_thread(caller, thread) { + /* Skip current, since it needs no changes. */ + if (thread == caller) + continue; + + /* Get a task reference for the new leaf node. */ + get_seccomp_filter(caller); + /* + * Drop the task reference to the shared ancestor since + * current's path will hold a reference. (This also + * allows a put before the assignment.) + */ + put_seccomp_filter(thread); + smp_store_release(&thread->seccomp.filter, + caller->seccomp.filter); + /* + * Opt the other thread into seccomp if needed.
+ * As threads are considered to be trust-realm + * equivalent (see ptrace_may_access), it is safe to + * allow one thread to transition the other. + */ + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + + seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); + } + } +} + /** * seccomp_prepare_filter: Prepares a seccomp filter for use. * @fprog: BPF program to install @@ -364,6 +473,15 @@ static long seccomp_attach_filter(unsigned int flags, if (total_insns > MAX_INSNS_PER_PATH) return -ENOMEM; + /* If thread sync has been requested, check that it is possible. */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC) { + int ret; + + ret = seccomp_can_sync_threads(); + if (ret) + return ret; + } + /* * If there is an existing filter, make it the prev and don't drop its * task reference. @@ -371,6 +489,10 @@ static long seccomp_attach_filter(unsigned int flags, filter->prev = current->seccomp.filter; current->seccomp.filter = filter; + /* Now that the new filter is in place, synchronize to all threads. */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC) + seccomp_sync_threads(); + return 0; } @@ -590,7 +712,7 @@ static long seccomp_set_mode_filter(unsigned int flags, long ret = -EINVAL; /* Validate flags. */ - if (flags != 0) + if (flags & ~SECCOMP_FILTER_FLAG_MASK) return -EINVAL; /* Prepare the new filter before holding any locks. */ @@ -598,6 +720,14 @@ static long seccomp_set_mode_filter(unsigned int flags, if (IS_ERR(prepared)) return PTR_ERR(prepared); + /* + * Make sure we cannot change seccomp or nnp state via TSYNC + * while another thread is in the middle of calling exec. 
+ */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC && + mutex_lock_killable(&current->signal->cred_guard_mutex)) + goto out_free; + spin_lock_irq(&current->sighand->siglock); if (!seccomp_may_assign_mode(seccomp_mode)) @@ -612,6 +742,9 @@ static long seccomp_set_mode_filter(unsigned int flags, seccomp_assign_mode(current, seccomp_mode); out: spin_unlock_irq(&current->sighand->siglock); + if (flags & SECCOMP_FILTER_FLAG_TSYNC) + mutex_unlock(&current->signal->cred_guard_mutex); +out_free: seccomp_filter_free(prepared); return ret; } -- cgit v1.1 From d431cbc53cb787a7f82d7d2fe0af65156db4d27a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 15 Jul 2014 22:02:11 +0200 Subject: PM / sleep: Simplify sleep states sysfs interface code Simplify the sleep states sysfs interface /sys/power/state code by redefining pm_states[] as an array of pointers to constant strings such that only the entries corresponding to valid states are set. Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 21 +++++++++++---------- kernel/power/power.h | 7 +------ kernel/power/suspend.c | 32 +++++++++++++------------------- kernel/power/suspend_test.c | 12 ++++++------ 4 files changed, 31 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 8e90f33..d57f66a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -296,8 +296,8 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (pm_states[i].state) - s += sprintf(s,"%s ", pm_states[i].label); + if (pm_states[i]) + s += sprintf(s,"%s ", pm_states[i]); #endif if (hibernation_available()) @@ -311,8 +311,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND - suspend_state_t state = PM_SUSPEND_MIN; - struct pm_sleep_state *s; + suspend_state_t state; #endif char *p; int len; @@
-325,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n) return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND - for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) - if (s->state && len == strlen(s->label) - && !strncmp(buf, s->label, len)) - return s->state; + for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { + const char *label = pm_states[state]; + + if (label && len == strlen(label) && !strncmp(buf, label, len)) + return state; + } #endif return PM_SUSPEND_ON; @@ -446,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj, #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) - return sprintf(buf, "%s\n", pm_states[state].state ? - pm_states[state].label : "error"); + return sprintf(buf, "%s\n", pm_states[state] ? + pm_states[state] : "error"); #endif #ifdef CONFIG_HIBERNATION return sprintf(buf, "disk\n"); diff --git a/kernel/power/power.h b/kernel/power/power.h index c60f13b..5d49dca 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -178,13 +178,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); #ifdef CONFIG_SUSPEND -struct pm_sleep_state { - const char *label; - suspend_state_t state; -}; - /* kernel/power/suspend.c */ -extern struct pm_sleep_state pm_states[]; +extern const char *pm_states[]; extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ed35a47..83f5b3e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -31,11 +31,8 @@ #include "power.h" -struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, - [PM_SUSPEND_STANDBY] = { .label = "standby", }, - [PM_SUSPEND_MEM] = { .label = "mem", }, -}; +static const char *pm_labels[] = { "mem", "standby", "freeze", }; +const char *pm_states[PM_SUSPEND_MAX]; static const struct platform_suspend_ops *suspend_ops; static 
const struct platform_freeze_ops *freeze_ops; @@ -97,10 +94,7 @@ static bool relative_states; static int __init sleep_states_setup(char *str) { relative_states = !strncmp(str, "1", 1); - if (relative_states) { - pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; - pm_states[PM_SUSPEND_FREEZE].state = 0; - } + pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; return 1; } @@ -113,20 +107,20 @@ __setup("relative_sleep_states=", sleep_states_setup); void suspend_set_ops(const struct platform_suspend_ops *ops) { suspend_state_t i; - int j = PM_SUSPEND_MAX - 1; + int j = 0; lock_system_sleep(); suspend_ops = ops; for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) - if (valid_state(i)) - pm_states[j--].state = i; - else if (!relative_states) - pm_states[j--].state = 0; + if (valid_state(i)) { + pm_states[i] = pm_labels[j++]; + } else if (!relative_states) { + pm_states[i] = NULL; + j++; + } - pm_states[j--].state = PM_SUSPEND_FREEZE; - while (j >= PM_SUSPEND_MIN) - pm_states[j--].state = 0; + pm_states[PM_SUSPEND_FREEZE] = pm_labels[j]; unlock_system_sleep(); } @@ -395,7 +389,7 @@ static int enter_state(suspend_state_t state) printk("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); error = suspend_prepare(state); if (error) goto Unlock; @@ -404,7 +398,7 @@ static int enter_state(suspend_state_t state) goto Finish; trace_suspend_resume(TPS("suspend_enter"), state, false); - pr_debug("PM: Entering %s sleep\n", pm_states[state].label); + pr_debug("PM: Entering %s sleep\n", pm_states[state]); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); pm_restore_gfp_mask(); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 269b097..2f524928 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -92,13 +92,13 @@ static void __init 
test_wakealarm(struct rtc_device *rtc, suspend_state_t state) } if (state == PM_SUSPEND_MEM) { - printk(info_test, pm_states[state].label); + printk(info_test, pm_states[state]); status = pm_suspend(state); if (status == -ENODEV) state = PM_SUSPEND_STANDBY; } if (state == PM_SUSPEND_STANDBY) { - printk(info_test, pm_states[state].label); + printk(info_test, pm_states[state]); status = pm_suspend(state); } if (status < 0) @@ -141,8 +141,8 @@ static int __init setup_test_suspend(char *value) /* "=mem" ==> "mem" */ value++; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (!strcmp(pm_states[i].label, value)) { - test_state = pm_states[i].state; + if (!strcmp(pm_states[i], value)) { + test_state = i; return 0; } @@ -162,8 +162,8 @@ static int __init test_suspend(void) /* PM is initialized by now; is that state testable? */ if (test_state == PM_SUSPEND_ON) goto done; - if (!pm_states[test_state].state) { - printk(warn_bad_state, pm_states[test_state].label); + if (!pm_states[test_state]) { + printk(warn_bad_state, pm_states[test_state]); goto done; } -- cgit v1.1 From 78c5e0bb145d3eac719fcad1ac1df763a71cf632 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 22 Jul 2014 15:43:12 +0100 Subject: PM / OPP: Remove ARCH_HAS_OPP Since the OPP layer is a kernel library which has been converted to be directly selectable by its callers rather than user selectable and requiring architectures to enable it explicitly the ARCH_HAS_OPP symbol has become redundant and can be removed. Do so. Signed-off-by: Mark Brown Reviewed-by: Viresh Kumar Acked-by: Nishanth Menon Acked-by: Rob Herring Acked-by: Shawn Guo Acked-by: Simon Horman Signed-off-by: Rafael J. 
Wysocki --- kernel/power/Kconfig | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9a83d78..e4e4121 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -253,9 +253,6 @@ config APM_EMULATION anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). -config ARCH_HAS_OPP - bool - config PM_OPP bool ---help--- -- cgit v1.1 From 8490fdf923fc6cf6c31a53b73cafdf582a9642f0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 23 Jul 2014 00:57:53 +0200 Subject: PM / sleep: Move platform suspend operations to separate functions After the introduction of freeze_ops it makes more sense to move all of the platform suspend operations to separate functions that each will do all of the necessary checks and choose the right callback to execute instead of doing all that in the core code which makes it generally harder to follow. Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 120 +++++++++++++++++++++++++++++++------------------ 1 file changed, 77 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 83f5b3e..9a071be 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -36,12 +36,6 @@ const char *pm_states[PM_SUSPEND_MAX]; static const struct platform_suspend_ops *suspend_ops; static const struct platform_freeze_ops *freeze_ops; - -static bool need_suspend_ops(suspend_state_t state) -{ - return state > PM_SUSPEND_FREEZE; -} - static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); static bool suspend_freeze_wake; @@ -139,6 +133,65 @@ int suspend_valid_only_mem(suspend_state_t state) } EXPORT_SYMBOL_GPL(suspend_valid_only_mem); +static bool sleep_state_supported(suspend_state_t state) +{ + return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter); +} + +static int platform_suspend_prepare(suspend_state_t state) +{ + return state != PM_SUSPEND_FREEZE &&
suspend_ops->prepare ? + suspend_ops->prepare() : 0; +} + +static int platform_suspend_prepare_late(suspend_state_t state) +{ + return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? + suspend_ops->prepare_late() : 0; +} + +static void platform_suspend_wake(suspend_state_t state) +{ + if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) + suspend_ops->wake(); +} + +static void platform_suspend_finish(suspend_state_t state) +{ + if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) + suspend_ops->finish(); +} + +static int platform_suspend_begin(suspend_state_t state) +{ + if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) + return freeze_ops->begin(); + else if (suspend_ops->begin) + return suspend_ops->begin(state); + else + return 0; +} + +static void platform_suspend_end(suspend_state_t state) +{ + if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) + freeze_ops->end(); + else if (suspend_ops->end) + suspend_ops->end(); +} + +static void platform_suspend_recover(suspend_state_t state) +{ + if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) + suspend_ops->recover(); +} + +static bool platform_suspend_again(suspend_state_t state) +{ + return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ? 
+ suspend_ops->suspend_again() : false; +} + static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG @@ -162,7 +215,7 @@ static int suspend_prepare(suspend_state_t state) { int error; - if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) + if (!sleep_state_supported(state)) return -EPERM; pm_prepare_console(); @@ -208,23 +261,18 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) { int error; - if (need_suspend_ops(state) && suspend_ops->prepare) { - error = suspend_ops->prepare(); - if (error) - goto Platform_finish; - } + error = platform_suspend_prepare(state); + if (error) + goto Platform_finish; error = dpm_suspend_end(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platform_finish; } - - if (need_suspend_ops(state) && suspend_ops->prepare_late) { - error = suspend_ops->prepare_late(); - if (error) - goto Platform_wake; - } + error = platform_suspend_prepare_late(state); + if (error) + goto Platform_wake; if (suspend_test(TEST_PLATFORM)) goto Platform_wake; @@ -272,15 +320,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) ftrace_start(); Platform_wake: - if (need_suspend_ops(state) && suspend_ops->wake) - suspend_ops->wake(); - + platform_suspend_wake(state); dpm_resume_start(PMSG_RESUME); Platform_finish: - if (need_suspend_ops(state) && suspend_ops->finish) - suspend_ops->finish(); - + platform_suspend_finish(state); return error; } @@ -293,18 +337,13 @@ int suspend_devices_and_enter(suspend_state_t state) int error; bool wakeup = false; - if (need_suspend_ops(state) && !suspend_ops) + if (!sleep_state_supported(state)) return -ENOSYS; - if (need_suspend_ops(state) && suspend_ops->begin) { - error = suspend_ops->begin(state); - if (error) - goto Close; - } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) { - error = freeze_ops->begin(); - if (error) - goto Close; - } + error = platform_suspend_begin(state); + if (error) + goto Close; + 
suspend_console(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); @@ -318,25 +357,20 @@ int suspend_devices_and_enter(suspend_state_t state) do { error = suspend_enter(state, &wakeup); - } while (!error && !wakeup && need_suspend_ops(state) - && suspend_ops->suspend_again && suspend_ops->suspend_again()); + } while (!error && !wakeup && platform_suspend_again(state)); Resume_devices: suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); resume_console(); - Close: - if (need_suspend_ops(state) && suspend_ops->end) - suspend_ops->end(); - else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) - freeze_ops->end(); + Close: + platform_suspend_end(state); return error; Recover_platform: - if (need_suspend_ops(state) && suspend_ops->recover) - suspend_ops->recover(); + platform_suspend_recover(state); goto Resume_devices; } -- cgit v1.1 From 28cb5ef16e578bbca0a562b09f12c8c98ca92720 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 23 Jul 2014 01:00:36 +0200 Subject: PM: Create PM workqueue if runtime PM is not configured too The PM workqueue is going to be used by ACPI PM notify handlers regardless of whether or not runtime PM is configured, so move it out of #ifdef CONFIG_PM_RUNTIME. Do that in three places in the ACPI device PM code. Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 8e90f33..a18efed 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -615,7 +615,6 @@ static struct attribute_group attr_group = { .attrs = g, }; -#ifdef CONFIG_PM_RUNTIME struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); @@ -625,9 +624,6 @@ static int __init pm_start_workqueue(void) return pm_wq ? 
0 : -ENOMEM; } -#else -static inline int pm_start_workqueue(void) { return 0; } -#endif static int __init pm_init(void) { -- cgit v1.1 From e704f93af5a083c07b8f722672d63a1d908daf55 Mon Sep 17 00:00:00 2001 From: David Riley Date: Mon, 16 Jun 2014 14:58:32 -0700 Subject: kernel: time: Add udelay_test module to validate udelay Create a module that allows udelay() to be executed to ensure that it is delaying at least as long as requested (with a little bit of error allowed). There are some configurations which don't have reliable udelay due to using a loop delay with cpufreq changes which should use a counter time based delay instead. This test aims to identify those configurations where timing is unreliable. Signed-off-by: David Riley Signed-off-by: John Stultz --- kernel/time/Makefile | 2 + kernel/time/udelay_test.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 kernel/time/udelay_test.c (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e59ce8b..7347426 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o +obj-$(CONFIG_TEST_UDELAY) += udelay_test.o $(obj)/time.o: $(obj)/timeconst.h @@ -29,3 +30,4 @@ quiet_cmd_bc = BC $@ targets += timeconst.h $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE $(call if_changed,bc) + diff --git a/kernel/time/udelay_test.c b/kernel/time/udelay_test.c new file mode 100644 index 0000000..e622ba3 --- /dev/null +++ b/kernel/time/udelay_test.c @@ -0,0 +1,168 @@ +/* + * udelay() test kernel module + * + * Test is executed by writing and reading to /sys/kernel/debug/udelay_test + * Tests are configured by writing: USECS ITERATIONS + * Tests are executed by reading from the same file.
+ * Specifying usecs of 0 or negative values will run multiples tests. + * + * Copyright (C) 2014 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include + +#define DEFAULT_ITERATIONS 100 + +#define DEBUGFS_FILENAME "udelay_test" + +static DEFINE_MUTEX(udelay_test_lock); +static struct dentry *udelay_test_debugfs_file; +static int udelay_test_usecs; +static int udelay_test_iterations = DEFAULT_ITERATIONS; + +static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) +{ + int min = 0, max = 0, fail_count = 0; + uint64_t sum = 0; + uint64_t avg; + int i; + /* Allow udelay to be up to 0.5% fast */ + int allowed_error_ns = usecs * 5; + + for (i = 0; i < iters; ++i) { + struct timespec ts1, ts2; + int time_passed; + + ktime_get_ts(&ts1); + udelay(usecs); + ktime_get_ts(&ts2); + time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); + + if (i == 0 || time_passed < min) + min = time_passed; + if (i == 0 || time_passed > max) + max = time_passed; + if ((time_passed + allowed_error_ns) / 1000 < usecs) + ++fail_count; + WARN_ON(time_passed < 0); + sum += time_passed; + } + + avg = sum; + do_div(avg, iters); + seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d", + usecs, iters, usecs * 1000, + (usecs * 1000) - allowed_error_ns, min, avg, max); + if (fail_count) + seq_printf(s, " FAIL=%d", fail_count); + seq_puts(s, "\n"); + + return 0; +} + +static int udelay_test_show(struct seq_file *s, void *v) +{ + int usecs; + int iters; + int ret = 0; + + 
mutex_lock(&udelay_test_lock); + usecs = udelay_test_usecs; + iters = udelay_test_iterations; + mutex_unlock(&udelay_test_lock); + + if (usecs > 0 && iters > 0) { + return udelay_test_single(s, usecs, iters); + } else if (usecs == 0) { + struct timespec ts; + + ktime_get_ts(&ts); + seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", + loops_per_jiffy, ts.tv_sec, ts.tv_nsec); + seq_puts(s, "usage:\n"); + seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); + seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); + } + + return ret; +} + +static int udelay_test_open(struct inode *inode, struct file *file) +{ + return single_open(file, udelay_test_show, inode->i_private); +} + +static ssize_t udelay_test_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + char lbuf[32]; + int ret; + int usecs; + int iters; + + if (count >= sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + lbuf[count] = '\0'; + + ret = sscanf(lbuf, "%d %d", &usecs, &iters); + if (ret < 1) + return -EINVAL; + else if (ret < 2) + iters = DEFAULT_ITERATIONS; + + mutex_lock(&udelay_test_lock); + udelay_test_usecs = usecs; + udelay_test_iterations = iters; + mutex_unlock(&udelay_test_lock); + + return count; +} + +static const struct file_operations udelay_test_debugfs_ops = { + .owner = THIS_MODULE, + .open = udelay_test_open, + .read = seq_read, + .write = udelay_test_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init udelay_test_init(void) +{ + mutex_lock(&udelay_test_lock); + udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME, + S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops); + mutex_unlock(&udelay_test_lock); + + return 0; +} + +module_init(udelay_test_init); + +static void __exit udelay_test_exit(void) +{ + mutex_lock(&udelay_test_lock); + debugfs_remove(udelay_test_debugfs_file); + mutex_unlock(&udelay_test_lock); +} + +module_exit(udelay_test_exit); + +MODULE_AUTHOR("David Riley 
"); +MODULE_LICENSE("GPL"); -- cgit v1.1 From e06fde37b860f5030e93475a2a95857af7ad13e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:03:50 +0000 Subject: timekeeping: Simplify arch_gettimeoffset() Provide a default stub function instead of having the extra conditional. Cuts binary size on a m68k build by ~100 bytes. Signed-off-by: Thomas Gleixner Acked-by: Geert Uytterhoeven Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 32d8d6a..908861c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -153,16 +153,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) /* Timekeeper helper functions. */ #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET -u32 (*arch_gettimeoffset)(void); - -u32 get_arch_timeoffset(void) -{ - if (likely(arch_gettimeoffset)) - return arch_gettimeoffset(); - return 0; -} +static u32 default_arch_gettimeoffset(void) { return 0; } +u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; #else -static inline u32 get_arch_timeoffset(void) { return 0; } +static inline u32 arch_gettimeoffset(void) { return 0; } #endif static inline s64 timekeeping_get_ns(struct timekeeper *tk) @@ -182,7 +176,7 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk) nsec >>= tk->shift; /* If arch requires, add in get_arch_timeoffset() */ - return nsec + get_arch_timeoffset(); + return nsec + arch_gettimeoffset(); } static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) @@ -202,7 +196,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); /* If arch requires, add in get_arch_timeoffset() */ - return nsec + get_arch_timeoffset(); + return nsec + arch_gettimeoffset(); } static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); @@ -282,7 +276,7 
@@ static void timekeeping_forward_now(struct timekeeper *tk) tk->xtime_nsec += cycle_delta * tk->mult; /* If arch requires, add in get_arch_timeoffset() */ - tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; + tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; tk_normalize_xtime(tk); -- cgit v1.1 From 76f4108892d9a9e3408bba839914f97a54086a6f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 16 Jul 2014 21:03:52 +0000 Subject: hrtimer: Cleanup hrtimer accessors to the timekeeping state Rather than having two similar but totally different implementations that provide timekeeping state to the hrtimer code, try to unify the two implementations to be more similar. Thus this clarifies ktime_get_update_offsets to ktime_get_update_offsets_now and changes get_xtime... to ktime_get_update_offsets_tick. Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/hrtimer.c | 19 ++++++++----------- kernel/time/timekeeping.c | 36 +++++++++++++++++++++++------------- 2 files changed, 31 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 66a6dc1..2f4ef8a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -114,21 +114,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) */ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { - ktime_t xtim, mono, boot; - struct timespec xts, tom, slp; - s32 tai_offset; + ktime_t xtim, mono, boot, tai; + ktime_t off_real, off_boot, off_tai; - get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); - tai_offset = timekeeping_get_tai_offset(); + mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); + boot = ktime_add(mono, off_boot); + xtim = ktime_add(mono, off_real); + tai = ktime_add(xtim, off_tai); - xtim = timespec_to_ktime(xts); - mono = ktime_add(xtim, timespec_to_ktime(tom)); - boot = ktime_add(mono, timespec_to_ktime(slp)); 
base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; - base->clock_base[HRTIMER_BASE_TAI].softirq_time = - ktime_add(xtim, ktime_set(tai_offset, 0)); + base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai; } /* @@ -673,7 +670,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); + return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); } /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 908861c..b94fa36 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1581,29 +1581,39 @@ void do_timer(unsigned long ticks) } /** - * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, - * and sleep offsets. 
- * @xtim: pointer to timespec to be set with xtime - * @wtom: pointer to timespec to be set with wall_to_monotonic - * @sleep: pointer to timespec to be set with time in suspend + * ktime_get_update_offsets_tick - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * @offs_tai: pointer to storage for monotonic -> clock tai offset + * + * Returns monotonic time at last tick and various offsets */ -void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, - struct timespec *wtom, struct timespec *sleep) +ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai) { struct timekeeper *tk = &timekeeper; - unsigned long seq; + struct timespec ts; + ktime_t now; + unsigned int seq; do { seq = read_seqcount_begin(&timekeeper_seq); - *xtim = tk_xtime(tk); - *wtom = tk->wall_to_monotonic; - *sleep = tk->total_sleep_time; + + ts = tk_xtime(tk); + + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; } while (read_seqcount_retry(&timekeeper_seq, seq)); + + now = ktime_set(ts.tv_sec, ts.tv_nsec); + now = ktime_sub(now, *offs_real); + return now; } #ifdef CONFIG_HIGH_RES_TIMERS /** - * ktime_get_update_offsets - hrtimer helper + * ktime_get_update_offsets_now - hrtimer helper * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset @@ -1611,7 +1621,7 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, * Returns current monotonic time and updates the offsets * Called from hrtimer_interrupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, +ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &timekeeper; -- cgit v1.1 
From 24e4a8c3e8868874835b0f1ad6dd417341e99822 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 16 Jul 2014 21:03:53 +0000 Subject: ktime: Kill non-scalar ktime_t implementation for 2038 The non-scalar ktime_t implementation is basically a timespec which has to be changed to support dates past 2038 on 32bit systems. This patch removes the non-scalar ktime_t implementation, forcing the scalar s64 nanosecond version on all architectures. This may have additional performance overhead on some 32bit systems when converting between ktime_t and timespec structures, however the majority of 32bit systems (arm and i386) were already using scalar ktime_t, so no performance regressions will be seen on those platforms. On affected platforms, I'm open to finding optimizations, including avoiding converting to timespecs where possible. [ tglx: We can now cleanup the ktime_t.tv64 mess, but thats a different issue and we can throw a coccinelle script at it ] Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/Kconfig | 4 ---- kernel/time/hrtimer.c | 54 ----------------------------------------------- kernel/time/timekeeping.c | 7 ++---- 3 files changed, 2 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f448513..feccfd8 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -20,10 +20,6 @@ config GENERIC_TIME_VSYSCALL config GENERIC_TIME_VSYSCALL_OLD bool -# ktime_t scalar 64bit nsec representation -config KTIME_SCALAR - bool - # Old style timekeeping config ARCH_USES_GETTIMEOFFSET bool diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 2f4ef8a..19f2110 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -261,60 +261,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * too large for inlining: */ #if BITS_PER_LONG < 64 -# ifndef CONFIG_KTIME_SCALAR -/** - * ktime_add_ns - Add a scalar nanoseconds value to a 
ktime_t variable - * @kt: addend - * @nsec: the scalar nsec value to add - * - * Returns the sum of kt and nsec in ktime_t format - */ -ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) -{ - ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - /* Make sure nsec fits into long */ - if (unlikely(nsec > KTIME_SEC_MAX)) - return (ktime_t){ .tv64 = KTIME_MAX }; - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_add(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_add_ns); - -/** - * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable - * @kt: minuend - * @nsec: the scalar nsec value to subtract - * - * Returns the subtraction of @nsec from @kt in ktime_t format - */ -ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) -{ - ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_sub(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_sub_ns); -# endif /* !CONFIG_KTIME_SCALAR */ - /* * Divide a ktime value by a nanosecond value */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b94fa36..cafef24 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -344,11 +344,8 @@ ktime_t ktime_get(void) nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; } while (read_seqcount_retry(&timekeeper_seq, seq)); - /* - * Use ktime_set/ktime_add_ns to create a proper ktime on - * 32-bit architectures without CONFIG_KTIME_SCALAR. 
- */ - return ktime_add_ns(ktime_set(secs, 0), nsecs); + + return ktime_set(secs, nsecs); } EXPORT_SYMBOL_GPL(ktime_get); -- cgit v1.1 From 166afb64511eef08e13331b970c44fe91cea45ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:03:55 +0000 Subject: ktime: Sanitize ktime_to_us/ms conversion With the plain nanoseconds based ktime_t we can simply use ktime_divns() instead of going through loops and hoops of timespec/timeval conversion. Reported-by: John Stultz Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/hrtimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 19f2110..64843a8 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -280,6 +280,7 @@ u64 ktime_divns(const ktime_t kt, s64 div) return dclc; } +EXPORT_SYMBOL_GPL(ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* -- cgit v1.1 From 49cd6f869984692547c57621bf42697aaa7f5622 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 16 Jul 2014 21:03:59 +0000 Subject: time: More core infrastructure for timespec64 Helper and conversion functions for timespec64. 
Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/time.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 7c7964c..e8121a6 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -420,6 +420,68 @@ struct timeval ns_to_timeval(const s64 nsec) } EXPORT_SYMBOL(ns_to_timeval); +#if BITS_PER_LONG == 32 +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) +{ + while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + asm("" : "+rm"(nsec)); + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec64); + +/** + * ns_to_timespec64 - Convert nanoseconds to timespec64 + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec64 representation of the nsec parameter. 
+ */ +struct timespec64 ns_to_timespec64(const s64 nsec) +{ + struct timespec64 ts; + s32 rem; + + if (!nsec) + return (struct timespec64) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +EXPORT_SYMBOL(ns_to_timespec64); +#endif /* * When we convert to jiffies then we interpret incoming values * the following way: -- cgit v1.1 From 7d489d15ce4be5310ca60e5896df833f9b3b4088 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 16 Jul 2014 21:04:01 +0000 Subject: timekeeping: Convert timekeeping core to use timespec64s Convert the core timekeeping logic to use timespec64s. This moves the 2038 issues out of the core logic and into all of the accessor functions. Future changes will need to push the timespec64s out to all timekeeping users, but that can be done interface by interface. Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/ntp.c | 8 +- kernel/time/ntp_internal.h | 2 +- kernel/time/timekeeping.c | 172 +++++++++++++++++++++---------------- kernel/time/timekeeping_debug.c | 2 +- kernel/time/timekeeping_internal.h | 2 +- 5 files changed, 104 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 33db43a..6e87df9 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -531,7 +531,7 @@ void ntp_notify_cmos_timer(void) { } /* * Propagate a new txc->status value into the NTP state: */ -static inline void process_adj_status(struct timex *txc, struct timespec *ts) +static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; @@ -554,7 +554,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) static inline void process_adjtimex_modes(struct timex *txc, - struct timespec *ts, + struct timespec64 *ts, s32 *time_tai) { 
if (txc->modes & ADJ_STATUS) @@ -640,7 +640,7 @@ int ntp_validate_timex(struct timex *txc) * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) +int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) { int result; @@ -684,7 +684,7 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) /* fill PPS status fields */ pps_fill_timex(txc); - txc->time.tv_sec = ts->tv_sec; + txc->time.tv_sec = (time_t)ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 1950cb4..bbd102a 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -7,6 +7,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern int second_overflow(unsigned long secs); extern int ntp_validate_timex(struct timex *); -extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); +extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); extern void __hardpps(const struct timespec *, const struct timespec *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cafef24..84a2075 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -51,43 +51,43 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) } } -static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) +static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; } -static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) +static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; 
tk_normalize_xtime(tk); } -static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) { - struct timespec tmp; + struct timespec64 tmp; /* * Verify consistency of: offset_real = -wall_to_monotonic * before modifying anything */ - set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, + set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, -tk->wall_to_monotonic.tv_nsec); - WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); + WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64); tk->wall_to_monotonic = wtm; - set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); - tk->offs_real = timespec_to_ktime(tmp); + set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); + tk->offs_real = timespec64_to_ktime(tmp); tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); } -static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) +static void tk_set_sleep_time(struct timekeeper *tk, struct timespec64 t) { /* Verify consistency before modifying */ - WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); + WARN_ON_ONCE(tk->offs_boot.tv64 != timespec64_to_ktime(tk->total_sleep_time).tv64); tk->total_sleep_time = t; - tk->offs_boot = timespec_to_ktime(t); + tk->offs_boot = timespec64_to_ktime(t); } /** @@ -281,7 +281,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_normalize_xtime(tk); nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - timespec_add_ns(&tk->raw_time, nsec); + timespec64_add_ns(&tk->raw_time, nsec); } /** @@ -360,7 +360,7 @@ EXPORT_SYMBOL_GPL(ktime_get); void ktime_get_ts(struct timespec *ts) { struct timekeeper *tk = &timekeeper; - struct timespec tomono; + struct timespec64 ts64, tomono; s64 nsec; unsigned int seq; @@ -368,15 +368,16 @@ void ktime_get_ts(struct timespec *ts) do { seq = read_seqcount_begin(&timekeeper_seq); - 
ts->tv_sec = tk->xtime_sec; + ts64.tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; } while (read_seqcount_retry(&timekeeper_seq, seq)); - ts->tv_sec += tomono.tv_sec; - ts->tv_nsec = 0; - timespec_add_ns(ts, nsec + tomono.tv_nsec); + ts64.tv_sec += tomono.tv_sec; + ts64.tv_nsec = 0; + timespec64_add_ns(&ts64, nsec + tomono.tv_nsec); + *ts = timespec64_to_timespec(ts64); } EXPORT_SYMBOL_GPL(ktime_get_ts); @@ -390,6 +391,7 @@ EXPORT_SYMBOL_GPL(ktime_get_ts); void timekeeping_clocktai(struct timespec *ts) { struct timekeeper *tk = &timekeeper; + struct timespec64 ts64; unsigned long seq; u64 nsecs; @@ -398,13 +400,14 @@ void timekeeping_clocktai(struct timespec *ts) do { seq = read_seqcount_begin(&timekeeper_seq); - ts->tv_sec = tk->xtime_sec + tk->tai_offset; + ts64.tv_sec = tk->xtime_sec + tk->tai_offset; nsecs = timekeeping_get_ns(tk); } while (read_seqcount_retry(&timekeeper_seq, seq)); - ts->tv_nsec = 0; - timespec_add_ns(ts, nsecs); + ts64.tv_nsec = 0; + timespec64_add_ns(&ts64, nsecs); + *ts = timespec64_to_timespec(ts64); } EXPORT_SYMBOL(timekeeping_clocktai); @@ -446,7 +449,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) do { seq = read_seqcount_begin(&timekeeper_seq); - *ts_raw = tk->raw_time; + *ts_raw = timespec64_to_timespec(tk->raw_time); ts_real->tv_sec = tk->xtime_sec; ts_real->tv_nsec = 0; @@ -487,7 +490,7 @@ EXPORT_SYMBOL(do_gettimeofday); int do_settimeofday(const struct timespec *tv) { struct timekeeper *tk = &timekeeper; - struct timespec ts_delta, xt; + struct timespec64 ts_delta, xt, tmp; unsigned long flags; if (!timespec_valid_strict(tv)) @@ -502,9 +505,10 @@ int do_settimeofday(const struct timespec *tv) ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; - tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); - tk_set_xtime(tk, tv); + tmp = 
timespec_to_timespec64(*tv); + tk_set_xtime(tk, &tmp); timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); @@ -528,26 +532,28 @@ int timekeeping_inject_offset(struct timespec *ts) { struct timekeeper *tk = &timekeeper; unsigned long flags; - struct timespec tmp; + struct timespec64 ts64, tmp; int ret = 0; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; + ts64 = timespec_to_timespec64(*ts); + raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); /* Make sure the proposed value is valid */ - tmp = timespec_add(tk_xtime(tk), *ts); - if (!timespec_valid_strict(&tmp)) { + tmp = timespec64_add(tk_xtime(tk), ts64); + if (!timespec64_valid_strict(&tmp)) { ret = -EINVAL; goto error; } - tk_xtime_add(tk, ts); - tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); + tk_xtime_add(tk, &ts64); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64)); error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); @@ -691,17 +697,19 @@ EXPORT_SYMBOL_GPL(ktime_get_real); void getrawmonotonic(struct timespec *ts) { struct timekeeper *tk = &timekeeper; + struct timespec64 ts64; unsigned long seq; s64 nsecs; do { seq = read_seqcount_begin(&timekeeper_seq); nsecs = timekeeping_get_ns_raw(tk); - *ts = tk->raw_time; + ts64 = tk->raw_time; } while (read_seqcount_retry(&timekeeper_seq, seq)); - timespec_add_ns(ts, nsecs); + timespec64_add_ns(&ts64, nsecs); + *ts = timespec64_to_timespec(ts64); } EXPORT_SYMBOL(getrawmonotonic); @@ -781,11 +789,12 @@ void __init timekeeping_init(void) struct timekeeper *tk = &timekeeper; struct clocksource *clock; unsigned long flags; - struct timespec now, boot, tmp; - - read_persistent_clock(&now); + struct timespec64 now, boot, tmp; + struct timespec ts; - if (!timespec_valid_strict(&now)) { + read_persistent_clock(&ts); + now = timespec_to_timespec64(ts); 
+ if (!timespec64_valid_strict(&now)) { pr_warn("WARNING: Persistent clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); now.tv_sec = 0; @@ -793,8 +802,9 @@ void __init timekeeping_init(void) } else if (now.tv_sec || now.tv_nsec) persistent_clock_exist = true; - read_boot_clock(&boot); - if (!timespec_valid_strict(&boot)) { + read_boot_clock(&ts); + boot = timespec_to_timespec64(ts); + if (!timespec64_valid_strict(&boot)) { pr_warn("WARNING: Boot clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); boot.tv_sec = 0; @@ -816,7 +826,7 @@ void __init timekeeping_init(void) if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(tk); - set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); + set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); tk_set_wall_to_mono(tk, tmp); tmp.tv_sec = 0; @@ -830,7 +840,7 @@ void __init timekeeping_init(void) } /* time in seconds when suspend began */ -static struct timespec timekeeping_suspend_time; +static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval @@ -840,17 +850,17 @@ static struct timespec timekeeping_suspend_time; * adds the sleep offset to the timekeeping variables. 
*/ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, - struct timespec *delta) + struct timespec64 *delta) { - if (!timespec_valid_strict(delta)) { + if (!timespec64_valid_strict(delta)) { printk_deferred(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); - tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); - tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); + tk_set_sleep_time(tk, timespec64_add(tk->total_sleep_time, *delta)); tk_debug_account_sleep_time(delta); } @@ -867,6 +877,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, void timekeeping_inject_sleeptime(struct timespec *delta) { struct timekeeper *tk = &timekeeper; + struct timespec64 tmp; unsigned long flags; /* @@ -881,7 +892,8 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_forward_now(tk); - __timekeeping_inject_sleeptime(tk, delta); + tmp = timespec_to_timespec64(*delta); + __timekeeping_inject_sleeptime(tk, &tmp); timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); @@ -904,11 +916,13 @@ static void timekeeping_resume(void) struct timekeeper *tk = &timekeeper; struct clocksource *clock = tk->clock; unsigned long flags; - struct timespec ts_new, ts_delta; + struct timespec64 ts_new, ts_delta; + struct timespec tmp; cycle_t cycle_now, cycle_delta; bool suspendtime_found = false; - read_persistent_clock(&ts_new); + read_persistent_clock(&tmp); + ts_new = timespec_to_timespec64(tmp); clockevents_resume(); clocksource_resume(); @@ -951,10 +965,10 @@ static void timekeeping_resume(void) } nsec += ((u64) cycle_delta * mult) >> shift; - ts_delta = ns_to_timespec(nsec); + ts_delta = ns_to_timespec64(nsec); suspendtime_found = true; - } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { - ts_delta = timespec_sub(ts_new, 
timekeeping_suspend_time); + } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { + ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); suspendtime_found = true; } @@ -981,10 +995,12 @@ static int timekeeping_suspend(void) { struct timekeeper *tk = &timekeeper; unsigned long flags; - struct timespec delta, delta_delta; - static struct timespec old_delta; + struct timespec64 delta, delta_delta; + static struct timespec64 old_delta; + struct timespec tmp; - read_persistent_clock(&timekeeping_suspend_time); + read_persistent_clock(&tmp); + timekeeping_suspend_time = timespec_to_timespec64(tmp); /* * On some systems the persistent_clock can not be detected at @@ -1005,8 +1021,8 @@ static int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); - delta_delta = timespec_sub(delta, old_delta); + delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); + delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* * if delta_delta is too large, assume time correction @@ -1016,7 +1032,7 @@ static int timekeeping_suspend(void) } else { /* Otherwise try to adjust old_system to compensate */ timekeeping_suspend_time = - timespec_add(timekeeping_suspend_time, delta_delta); + timespec64_add(timekeeping_suspend_time, delta_delta); } timekeeping_update(tk, TK_MIRROR); @@ -1253,14 +1269,14 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); if (unlikely(leap)) { - struct timespec ts; + struct timespec64 ts; tk->xtime_sec += leap; ts.tv_sec = leap; ts.tv_nsec = 0; tk_set_wall_to_mono(tk, - timespec_sub(tk->wall_to_monotonic, ts)); + timespec64_sub(tk->wall_to_monotonic, ts)); __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); @@ -1469,7 +1485,7 @@ 
EXPORT_SYMBOL_GPL(getboottime); void get_monotonic_boottime(struct timespec *ts) { struct timekeeper *tk = &timekeeper; - struct timespec tomono, sleep; + struct timespec64 tomono, sleep, ret; s64 nsec; unsigned int seq; @@ -1477,16 +1493,17 @@ void get_monotonic_boottime(struct timespec *ts) do { seq = read_seqcount_begin(&timekeeper_seq); - ts->tv_sec = tk->xtime_sec; + ret.tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; } while (read_seqcount_retry(&timekeeper_seq, seq)); - ts->tv_sec += tomono.tv_sec + sleep.tv_sec; - ts->tv_nsec = 0; - timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec); + ret.tv_sec += tomono.tv_sec + sleep.tv_sec; + ret.tv_nsec = 0; + timespec64_add_ns(&ret, nsec + tomono.tv_nsec + sleep.tv_nsec); + *ts = timespec64_to_timespec(ret); } EXPORT_SYMBOL_GPL(get_monotonic_boottime); @@ -1514,8 +1531,11 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime); void monotonic_to_bootbased(struct timespec *ts) { struct timekeeper *tk = &timekeeper; + struct timespec64 ts64; - *ts = timespec_add(*ts, tk->total_sleep_time); + ts64 = timespec_to_timespec64(*ts); + ts64 = timespec64_add(ts64, tk->total_sleep_time); + *ts = timespec64_to_timespec(ts64); } EXPORT_SYMBOL_GPL(monotonic_to_bootbased); @@ -1531,13 +1551,13 @@ struct timespec __current_kernel_time(void) { struct timekeeper *tk = &timekeeper; - return tk_xtime(tk); + return timespec64_to_timespec(tk_xtime(tk)); } struct timespec current_kernel_time(void) { struct timekeeper *tk = &timekeeper; - struct timespec now; + struct timespec64 now; unsigned long seq; do { @@ -1546,14 +1566,14 @@ struct timespec current_kernel_time(void) now = tk_xtime(tk); } while (read_seqcount_retry(&timekeeper_seq, seq)); - return now; + return timespec64_to_timespec(now); } EXPORT_SYMBOL(current_kernel_time); struct timespec get_monotonic_coarse(void) { struct timekeeper *tk = &timekeeper; - struct timespec now, mono; + struct timespec64 now, mono; unsigned 
long seq; do { @@ -1563,9 +1583,10 @@ struct timespec get_monotonic_coarse(void) mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&timekeeper_seq, seq)); - set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, + set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); - return now; + + return timespec64_to_timespec(now); } /* @@ -1589,7 +1610,7 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &timekeeper; - struct timespec ts; + struct timespec64 ts; ktime_t now; unsigned int seq; @@ -1597,7 +1618,6 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, seq = read_seqcount_begin(&timekeeper_seq); ts = tk_xtime(tk); - *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; @@ -1650,14 +1670,14 @@ ktime_t ktime_get_monotonic_offset(void) { struct timekeeper *tk = &timekeeper; unsigned long seq; - struct timespec wtom; + struct timespec64 wtom; do { seq = read_seqcount_begin(&timekeeper_seq); wtom = tk->wall_to_monotonic; } while (read_seqcount_retry(&timekeeper_seq, seq)); - return timespec_to_ktime(wtom); + return timespec64_to_ktime(wtom); } EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); @@ -1668,7 +1688,8 @@ int do_adjtimex(struct timex *txc) { struct timekeeper *tk = &timekeeper; unsigned long flags; - struct timespec ts; + struct timespec64 ts; + struct timespec tmp; s32 orig_tai, tai; int ret; @@ -1688,7 +1709,8 @@ int do_adjtimex(struct timex *txc) return ret; } - getnstimeofday(&ts); + getnstimeofday(&tmp); + ts = timespec_to_timespec64(tmp); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 4d54f97..f6bd652 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -67,7 +67,7 @@ static int __init tk_debug_sleep_time_init(void) } 
late_initcall(tk_debug_sleep_time_init); -void tk_debug_account_sleep_time(struct timespec *t) +void tk_debug_account_sleep_time(struct timespec64 *t) { sleep_time_bin[fls(t->tv_sec)]++; } diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 13323ea..e3d28ad 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -6,7 +6,7 @@ #include #ifdef CONFIG_DEBUG_FS -extern void tk_debug_account_sleep_time(struct timespec *t); +extern void tk_debug_account_sleep_time(struct timespec64 *t); #else #define tk_debug_account_sleep_time(x) #endif -- cgit v1.1 From 8b094cd03b4a3793220d8d8d86a173bfea8c285b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:02 +0000 Subject: time: Consolidate the time accessor prototypes Right now we have time related prototypes in 3 different header files. Move it to a single timekeeping header file and move the core internal stuff into a core private header. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/hrtimer.c | 2 ++ kernel/time/posix-timers.c | 2 ++ kernel/time/tick-internal.h | 2 ++ kernel/time/time.c | 1 + kernel/time/timekeeping.h | 20 ++++++++++++++++++++ 5 files changed, 27 insertions(+) create mode 100644 kernel/time/timekeeping.h (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 64843a8..1c2fe7d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -54,6 +54,8 @@ #include +#include "timekeeping.h" + /* * The timer bases: * diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 424c2d4..42b463a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -49,6 +49,8 @@ #include #include +#include "timekeeping.h" + /* * Management arrays for POSIX timers. Timers are now kept in static hash table * with 512 entries. 
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 7ab92b1..c19c1d8 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include #include +#include "timekeeping.h" + extern seqlock_t jiffies_lock; #define CS_NAME_LEN 32 diff --git a/kernel/time/time.c b/kernel/time/time.c index e8121a6..278c63c 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -42,6 +42,7 @@ #include #include "timeconst.h" +#include "timekeeping.h" /* * The timezone where the local system is located. Used as a default by some diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h new file mode 100644 index 0000000..adc1fc9 --- /dev/null +++ b/kernel/time/timekeeping.h @@ -0,0 +1,20 @@ +#ifndef _KERNEL_TIME_TIMEKEEPING_H +#define _KERNEL_TIME_TIMEKEEPING_H +/* + * Internal interfaces for kernel/time/ + */ +extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); +extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); + +extern int timekeeping_valid_for_hres(void); +extern u64 timekeeping_max_deferment(void); +extern int timekeeping_inject_offset(struct timespec *ts); +extern s32 timekeeping_get_tai_offset(void); +extern void timekeeping_set_tai_offset(s32 tai_offset); +extern void timekeeping_clocktai(struct timespec *ts); + +#endif -- cgit v1.1 From d6d29896c665dfd50e6e0be7a9039901640433a3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:04 +0000 Subject: timekeeping: Provide timespec64 based interfaces To convert callers of the core code to timespec64 we need to provide the proper interfaces. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/ntp.c | 7 ++++--- kernel/time/timekeeping.c | 47 ++++++++++++++++++++++------------------------- 2 files changed, 26 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 6e87df9..87a346f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -466,7 +466,8 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); static void sync_cmos_clock(struct work_struct *work) { - struct timespec now, next; + struct timespec64 now; + struct timespec next; int fail = 1; /* @@ -485,9 +486,9 @@ static void sync_cmos_clock(struct work_struct *work) return; } - getnstimeofday(&now); + getnstimeofday64(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { - struct timespec adjust = now; + struct timespec adjust = timespec64_to_timespec(now); fail = -ENODEV; if (persistent_clock_is_local) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 84a2075..3210c9e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -285,13 +285,13 @@ static void timekeeping_forward_now(struct timekeeper *tk) } /** - * __getnstimeofday - Returns the time of day in a timespec. + * __getnstimeofday64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Updates the time of day in the timespec. * Returns 0 on success, or -ve when suspended (timespec will be undefined). 
*/ -int __getnstimeofday(struct timespec *ts) +int __getnstimeofday64(struct timespec64 *ts) { struct timekeeper *tk = &timekeeper; unsigned long seq; @@ -306,7 +306,7 @@ int __getnstimeofday(struct timespec *ts) } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_nsec = 0; - timespec_add_ns(ts, nsecs); + timespec64_add_ns(ts, nsecs); /* * Do not bail out early, in case there were callers still using @@ -316,19 +316,19 @@ int __getnstimeofday(struct timespec *ts) return -EAGAIN; return 0; } -EXPORT_SYMBOL(__getnstimeofday); +EXPORT_SYMBOL(__getnstimeofday64); /** - * getnstimeofday - Returns the time of day in a timespec. + * getnstimeofday64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Returns the time of day in a timespec (WARN if suspended). */ -void getnstimeofday(struct timespec *ts) +void getnstimeofday64(struct timespec64 *ts) { - WARN_ON(__getnstimeofday(ts)); + WARN_ON(__getnstimeofday64(ts)); } -EXPORT_SYMBOL(getnstimeofday); +EXPORT_SYMBOL(getnstimeofday64); ktime_t ktime_get(void) { @@ -350,17 +350,17 @@ ktime_t ktime_get(void) EXPORT_SYMBOL_GPL(ktime_get); /** - * ktime_get_ts - get the monotonic clock in timespec format + * ktime_get_ts64 - get the monotonic clock in timespec64 format * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime * clock and the wall_to_monotonic offset and stores the result * in normalized timespec format in the variable pointed to by @ts. 
*/ -void ktime_get_ts(struct timespec *ts) +void ktime_get_ts64(struct timespec64 *ts) { struct timekeeper *tk = &timekeeper; - struct timespec64 ts64, tomono; + struct timespec64 tomono; s64 nsec; unsigned int seq; @@ -368,18 +368,17 @@ void ktime_get_ts(struct timespec *ts) do { seq = read_seqcount_begin(&timekeeper_seq); - ts64.tv_sec = tk->xtime_sec; + ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; } while (read_seqcount_retry(&timekeeper_seq, seq)); - ts64.tv_sec += tomono.tv_sec; - ts64.tv_nsec = 0; - timespec64_add_ns(&ts64, nsec + tomono.tv_nsec); - *ts = timespec64_to_timespec(ts64); + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsec + tomono.tv_nsec); } -EXPORT_SYMBOL_GPL(ktime_get_ts); +EXPORT_SYMBOL_GPL(ktime_get_ts64); /** @@ -473,9 +472,9 @@ EXPORT_SYMBOL(getnstime_raw_and_real); */ void do_gettimeofday(struct timeval *tv) { - struct timespec now; + struct timespec64 now; - getnstimeofday(&now); + getnstimeofday64(&now); tv->tv_sec = now.tv_sec; tv->tv_usec = now.tv_nsec/1000; } @@ -680,11 +679,11 @@ int timekeeping_notify(struct clocksource *clock) */ ktime_t ktime_get_real(void) { - struct timespec now; + struct timespec64 now; - getnstimeofday(&now); + getnstimeofday64(&now); - return timespec_to_ktime(now); + return timespec64_to_ktime(now); } EXPORT_SYMBOL_GPL(ktime_get_real); @@ -1689,7 +1688,6 @@ int do_adjtimex(struct timex *txc) struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec64 ts; - struct timespec tmp; s32 orig_tai, tai; int ret; @@ -1709,8 +1707,7 @@ int do_adjtimex(struct timex *txc) return ret; } - getnstimeofday(&tmp); - ts = timespec_to_timespec64(tmp); + getnstimeofday64(&ts); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); -- cgit v1.1 From c905fae43f61c2b4508fc01722e8db61b6b8ac0b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:05 +0000 Subject: timekeeper: Move tk_xtime 
to core code No users outside of the core. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 70 +++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3210c9e..983d67b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -51,6 +51,15 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) } } +static inline struct timespec64 tk_xtime(struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); + return ts; +} + static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; @@ -199,6 +208,40 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) return nsec + arch_gettimeoffset(); } +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD + +static inline void update_vsyscall(struct timekeeper *tk) +{ + struct timespec xt; + + xt = tk_xtime(tk); + update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); +} + +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ + s64 remainder; + + /* + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistnecies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD + * users are removed, this can be killed. 
+ */ + remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); + tk->xtime_nsec -= remainder; + tk->xtime_nsec += 1ULL << tk->shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; +} +#else +#define old_vsyscall_fixup(tk) +#endif + static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) @@ -1330,33 +1373,6 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, return offset; } -#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD -static inline void old_vsyscall_fixup(struct timekeeper *tk) -{ - s64 remainder; - - /* - * Store only full nanoseconds into xtime_nsec after rounding - * it up and add the remainder to the error difference. - * XXX - This is necessary to avoid small 1ns inconsistnecies caused - * by truncating the remainder in vsyscalls. However, it causes - * additional work to be done in timekeeping_adjust(). Once - * the vsyscall implementations are converted to use xtime_nsec - * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD - * users are removed, this can be killed. - */ - remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); - tk->xtime_nsec -= remainder; - tk->xtime_nsec += 1ULL << tk->shift; - tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; -} -#else -#define old_vsyscall_fixup(tk) -#endif - - - /** * update_wall_time - Uses the current clocksource to increment the wall time * -- cgit v1.1 From 3fdb14fd1df70325e1e91e1203a699a4803ed741 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:07 +0000 Subject: timekeeping: Cache optimize struct timekeeper struct timekeeper is quite badly sorted for the hot readout path. Most time access functions need to load two cache lines. Rearrange it so ktime_get() and getnstimeofday() are happy with a single cache line. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 185 ++++++++++++++++++++++++---------------------- 1 file changed, 97 insertions(+), 88 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 983d67b..7ca150a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -32,9 +32,16 @@ #define TK_MIRROR (1 << 1) #define TK_CLOCK_WAS_SET (1 << 2) -static struct timekeeper timekeeper; +/* + * The most important data for readout fits into a single 64 byte + * cache line. + */ +static struct { + seqcount_t seq; + struct timekeeper timekeeper; +} tk_core ____cacheline_aligned; + static DEFINE_RAW_SPINLOCK(timekeeper_lock); -static seqcount_t timekeeper_seq; static struct timekeeper shadow_timekeeper; /* flag for if timekeeping is suspended */ @@ -254,7 +261,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; int ret; @@ -295,7 +302,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); if (action & TK_MIRROR) - memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + memcpy(&shadow_timekeeper, &tk_core.timekeeper, + sizeof(tk_core.timekeeper)); } /** @@ -336,17 +344,17 @@ static void timekeeping_forward_now(struct timekeeper *tk) */ int __getnstimeofday64(struct timespec64 *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; s64 nsecs = 0; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, 
nsecs); @@ -375,18 +383,18 @@ EXPORT_SYMBOL(getnstimeofday64); ktime_t ktime_get(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; s64 secs, nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_set(secs, nsecs); } @@ -402,7 +410,7 @@ EXPORT_SYMBOL_GPL(ktime_get); */ void ktime_get_ts64(struct timespec64 *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tomono; s64 nsec; unsigned int seq; @@ -410,12 +418,12 @@ void ktime_get_ts64(struct timespec64 *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; @@ -432,7 +440,7 @@ EXPORT_SYMBOL_GPL(ktime_get_ts64); */ void timekeeping_clocktai(struct timespec *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts64; unsigned long seq; u64 nsecs; @@ -440,12 +448,12 @@ void timekeeping_clocktai(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ts64.tv_sec = tk->xtime_sec + tk->tai_offset; nsecs = timekeeping_get_ns(tk); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); ts64.tv_nsec = 0; timespec64_add_ns(&ts64, nsecs); @@ -482,14 +490,14 @@ 
EXPORT_SYMBOL(ktime_get_clocktai); */ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; s64 nsecs_raw, nsecs_real; WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); *ts_raw = timespec64_to_timespec(tk->raw_time); ts_real->tv_sec = tk->xtime_sec; @@ -498,7 +506,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(tk); nsecs_real = timekeeping_get_ns(tk); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -531,7 +539,7 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(const struct timespec *tv) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts_delta, xt, tmp; unsigned long flags; @@ -539,7 +547,7 @@ int do_settimeofday(const struct timespec *tv) return -EINVAL; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); @@ -554,7 +562,7 @@ int do_settimeofday(const struct timespec *tv) timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ @@ -572,7 +580,7 @@ EXPORT_SYMBOL(do_settimeofday); */ int timekeeping_inject_offset(struct timespec *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; struct timespec64 ts64, tmp; int ret = 0; @@ -583,7 +591,7 @@ int timekeeping_inject_offset(struct timespec *ts) ts64 = timespec_to_timespec64(*ts); 
raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); @@ -600,7 +608,7 @@ int timekeeping_inject_offset(struct timespec *ts) error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ @@ -617,14 +625,14 @@ EXPORT_SYMBOL(timekeeping_inject_offset); */ s32 timekeeping_get_tai_offset(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; s32 ret; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ret = tk->tai_offset; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } @@ -645,14 +653,14 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) */ void timekeeping_set_tai_offset(s32 tai_offset) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); __timekeeping_set_tai_offset(tk, tai_offset); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clock_was_set(); } @@ -664,14 +672,14 @@ void timekeeping_set_tai_offset(s32 tai_offset) */ static int change_clocksource(void *data) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *new, *old; unsigned long flags; new = (struct clocksource *) data; raw_spin_lock_irqsave(&timekeeper_lock, flags); - 
write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); /* @@ -691,7 +699,7 @@ static int change_clocksource(void *data) } timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return 0; @@ -706,7 +714,7 @@ static int change_clocksource(void *data) */ int timekeeping_notify(struct clocksource *clock) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; if (tk->clock == clock) return 0; @@ -738,17 +746,17 @@ EXPORT_SYMBOL_GPL(ktime_get_real); */ void getrawmonotonic(struct timespec *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts64; unsigned long seq; s64 nsecs; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); nsecs = timekeeping_get_ns_raw(tk); ts64 = tk->raw_time; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); timespec64_add_ns(&ts64, nsecs); *ts = timespec64_to_timespec(ts64); @@ -760,16 +768,16 @@ EXPORT_SYMBOL(getrawmonotonic); */ int timekeeping_valid_for_hres(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; int ret; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } @@ -779,16 +787,16 @@ int timekeeping_valid_for_hres(void) */ u64 timekeeping_max_deferment(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; u64 ret; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ret = 
tk->clock->max_idle_ns; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } @@ -828,7 +836,7 @@ void __weak read_boot_clock(struct timespec *ts) */ void __init timekeeping_init(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock; unsigned long flags; struct timespec64 now, boot, tmp; @@ -854,7 +862,7 @@ void __init timekeeping_init(void) } raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); ntp_init(); clock = clocksource_default_clock(); @@ -875,9 +883,10 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + memcpy(&shadow_timekeeper, &tk_core.timekeeper, + sizeof(tk_core.timekeeper)); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } @@ -918,7 +927,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, */ void timekeeping_inject_sleeptime(struct timespec *delta) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tmp; unsigned long flags; @@ -930,7 +939,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) return; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); @@ -939,7 +948,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ @@ -955,7 +964,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) */ static void timekeeping_resume(void) { - struct 
timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock = tk->clock; unsigned long flags; struct timespec64 ts_new, ts_delta; @@ -970,7 +979,7 @@ static void timekeeping_resume(void) clocksource_resume(); raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); /* * After system resumes, we need to calculate the suspended time and @@ -1022,7 +1031,7 @@ static void timekeeping_resume(void) tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -1035,7 +1044,7 @@ static void timekeeping_resume(void) static int timekeeping_suspend(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; struct timespec64 delta, delta_delta; static struct timespec64 old_delta; @@ -1053,7 +1062,7 @@ static int timekeeping_suspend(void) persistent_clock_exist = true; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -1078,7 +1087,7 @@ static int timekeeping_suspend(void) } timekeeping_update(tk, TK_MIRROR); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); @@ -1380,7 +1389,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, void update_wall_time(void) { struct clocksource *clock; - struct timekeeper *real_tk = &timekeeper; + struct timekeeper *real_tk = &tk_core.timekeeper; struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; @@ -1440,7 +1449,7 @@ void update_wall_time(void) */ clock_set |= 
accumulate_nsecs_to_secs(tk); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); /* Update clock->cycle_last with the new value */ clock->cycle_last = tk->cycle_last; /* @@ -1450,12 +1459,12 @@ void update_wall_time(void) * requires changes to all other timekeeper usage sites as * well, i.e. move the timekeeper pointer getter into the * spinlocked/seqcount protected sections. And we trade this - * memcpy under the timekeeper_seq against one before we start + * memcpy under the tk_core.seq against one before we start * updating. */ memcpy(real_tk, tk, sizeof(*tk)); timekeeping_update(real_tk, clock_set); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); if (clock_set) @@ -1476,7 +1485,7 @@ out: */ void getboottime(struct timespec *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct timespec boottime = { .tv_sec = tk->wall_to_monotonic.tv_sec + tk->total_sleep_time.tv_sec, @@ -1499,7 +1508,7 @@ EXPORT_SYMBOL_GPL(getboottime); */ void get_monotonic_boottime(struct timespec *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tomono, sleep, ret; s64 nsec; unsigned int seq; @@ -1507,13 +1516,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ret.tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); ret.tv_sec += tomono.tv_sec + sleep.tv_sec; ret.tv_nsec = 0; @@ -1545,7 +1554,7 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime); */ void monotonic_to_bootbased(struct timespec *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct 
timespec64 ts64; ts64 = timespec_to_timespec64(*ts); @@ -1556,7 +1565,7 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; return tk->xtime_sec; } @@ -1564,22 +1573,22 @@ EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; return timespec64_to_timespec(tk_xtime(tk)); } struct timespec current_kernel_time(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now; unsigned long seq; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime(tk); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return timespec64_to_timespec(now); } @@ -1587,16 +1596,16 @@ EXPORT_SYMBOL(current_kernel_time); struct timespec get_monotonic_coarse(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; unsigned long seq; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1624,19 +1633,19 @@ void do_timer(unsigned long ticks) ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts; ktime_t now; unsigned int seq; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ts = tk_xtime(tk); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; - } 
while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); now = ktime_set(ts.tv_sec, ts.tv_nsec); now = ktime_sub(now, *offs_real); @@ -1656,13 +1665,13 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; ktime_t now; unsigned int seq; u64 secs, nsecs; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); secs = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); @@ -1670,7 +1679,7 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1683,14 +1692,14 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, */ ktime_t ktime_get_monotonic_offset(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; struct timespec64 wtom; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); wtom = tk->wall_to_monotonic; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return timespec64_to_ktime(wtom); } @@ -1701,7 +1710,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ int do_adjtimex(struct timex *txc) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; struct timespec64 ts; s32 orig_tai, tai; @@ -1726,7 +1735,7 @@ int do_adjtimex(struct timex *txc) getnstimeofday64(&ts); raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + 
write_seqcount_begin(&tk_core.seq); orig_tai = tai = tk->tai_offset; ret = __do_adjtimex(txc, &ts, &tai); @@ -1735,7 +1744,7 @@ int do_adjtimex(struct timex *txc) __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); } - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); if (tai != orig_tai) @@ -1755,11 +1764,11 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); __hardpps(phase_ts, raw_ts); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); -- cgit v1.1 From f111adfdd7ff7d9fe54b6efa440b80824984749c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:09 +0000 Subject: timekeeping: Use timekeeping_update() instead of memcpy() We already have a function which does the right thing, that also makes sure that the coming ktime_t based cached values are getting updated. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7ca150a..bfe3ea0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -883,8 +883,7 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - memcpy(&shadow_timekeeper, &tk_core.timekeeper, - sizeof(tk_core.timekeeper)); + timekeeping_update(tk, TK_MIRROR); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); -- cgit v1.1 From 7c032df5570388044b4efda3d9f4d2ffb96a3116 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:10 +0000 Subject: timekeeping: Provide internal ktime_t based data The ktime_t based interfaces are used a lot in performance critical code paths. Add ktime_t based data so the interfaces don't have to convert from the xtime/timespec based data. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index bfe3ea0..86a9247 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -291,6 +291,26 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); +/* + * Update the ktime_t based scalar nsec members of the timekeeper + */ +static inline void tk_update_ktime_data(struct timekeeper *tk) +{ + s64 nsec; + + /* + * The xtime based monotonic readout is: + * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); + * The ktime based monotonic readout is: + * nsec = base_mono + now(); + * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + */ + nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); + nsec *= NSEC_PER_SEC; + nsec += tk->wall_to_monotonic.tv_nsec; + tk->base_mono = ns_to_ktime(nsec); +} + /* must hold timekeeper_lock */ static void timekeeping_update(struct timekeeper *tk, unsigned int action) { @@ -301,6 +321,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + tk_update_ktime_data(tk); + if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); -- cgit v1.1 From a016a5bd62e29a738531d9d4d925037a1fdb52f5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:12 +0000 Subject: timekeeping: Use ktime_t based data for ktime_get() Speed up ktime_get() by using ktime_t based data. Text size shrinks by 64 bytes on x8664. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 86a9247..d5be142 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -407,18 +407,19 @@ ktime_t ktime_get(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; - s64 secs, nsecs; + ktime_t base; + s64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); - secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; + base = tk->base_mono; + nsecs = timekeeping_get_ns(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); - return ktime_set(secs, nsecs); + return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get); -- cgit v1.1 From 0077dc60f274b9a7e9aa705a34784fefb87e0eee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:13 +0000 Subject: timekeeping: Provide ktime_get_with_offset() Provide a helper function which lets us implement ktime_t based interfaces for real, boot and tai clocks. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d5be142..7c5f5e4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -423,6 +423,33 @@ ktime_t ktime_get(void) } EXPORT_SYMBOL_GPL(ktime_get); +static ktime_t *offsets[TK_OFFS_MAX] = { + [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, + [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, + [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, +}; + +ktime_t ktime_get_with_offset(enum tk_offsets offs) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base, *offset = offsets[offs]; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = ktime_add(tk->base_mono, *offset); + nsecs = timekeeping_get_ns(tk); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); + +} +EXPORT_SYMBOL_GPL(ktime_get_with_offset); + /** * ktime_get_ts64 - get the monotonic clock in timespec64 format * @ts: pointer to timespec variable -- cgit v1.1 From f5264d5d5a0729306cc792d84432b97785d2662a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:14 +0000 Subject: timekeeping: Use ktime_t based data for ktime_get_real() Speed up the readout. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7c5f5e4..56db2e1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -774,21 +774,6 @@ int timekeeping_notify(struct clocksource *clock) } /** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ - struct timespec64 now; - - getnstimeofday64(&now); - - return timespec64_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get_real); - -/** * getrawmonotonic - Returns the raw monotonic time in a timespec * @ts: pointer to the timespec to be set * -- cgit v1.1 From b82c817e2d16e818c472eb71019de521816000a3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:16 +0000 Subject: timekeeping; Use ktime_t based data for ktime_get_boottime() Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 56db2e1..5e60aa0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1566,23 +1566,6 @@ void get_monotonic_boottime(struct timespec *ts) EXPORT_SYMBOL_GPL(get_monotonic_boottime); /** - * ktime_get_boottime - Returns monotonic time since boot in a ktime - * - * Returns the monotonic time since boot in a ktime - * - * This is similar to CLOCK_MONTONIC/ktime_get, but also - * includes the time spent in suspend. - */ -ktime_t ktime_get_boottime(void) -{ - struct timespec ts; - - get_monotonic_boottime(&ts); - return timespec_to_ktime(ts); -} -EXPORT_SYMBOL_GPL(ktime_get_boottime); - -/** * monotonic_to_bootbased - Convert the monotonic time to boot based. 
* @ts: pointer to the timespec to be converted */ -- cgit v1.1 From afab07c0e91ecf098abf34573ccfcd86d6be26f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:17 +0000 Subject: timekeeping: Use ktime_t based data for ktime_get_clocktai() Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5e60aa0..c083ae2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -512,21 +512,6 @@ void timekeeping_clocktai(struct timespec *ts) } EXPORT_SYMBOL(timekeeping_clocktai); - -/** - * ktime_get_clocktai - Returns the TAI time of day in a ktime - * - * Returns the time of day in a ktime. - */ -ktime_t ktime_get_clocktai(void) -{ - struct timespec ts; - - timekeeping_clocktai(&ts); - return timespec_to_ktime(ts); -} -EXPORT_SYMBOL(ktime_get_clocktai); - #ifdef CONFIG_NTP_PPS /** -- cgit v1.1 From a37c0aad6093575b52432b47b145304f1af18dff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:19 +0000 Subject: timekeeping: Use ktime_t data for ktime_get_update_offsets_now() No need to juggle with timespecs. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c083ae2..54d9052 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1668,14 +1668,14 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; - ktime_t now; unsigned int seq; - u64 secs, nsecs; + ktime_t base; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); - secs = tk->xtime_sec; + base = tk->base_mono; nsecs = timekeeping_get_ns(tk); *offs_real = tk->offs_real; @@ -1683,9 +1683,7 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, *offs_tai = tk->offs_tai; } while (read_seqcount_retry(&tk_core.seq, seq)); - now = ktime_add_ns(ktime_set(secs, 0), nsecs); - now = ktime_sub(now, *offs_real); - return now; + return ktime_add_ns(base, nsecs); } #endif -- cgit v1.1 From 48064f5f67d58f95094305ac575d5372b58e265f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:20 +0000 Subject: timekeeping; Use ktime based data for ktime_get_update_offsets_tick() No need to juggle with timespecs. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 54d9052..e993503 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1636,22 +1636,22 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 ts; - ktime_t now; unsigned int seq; + ktime_t base; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); - ts = tk_xtime(tk); + base = tk->base_mono; + nsecs = tk->xtime_nsec >> tk->shift; + *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } while (read_seqcount_retry(&tk_core.seq, seq)); - now = ktime_set(ts.tv_sec, ts.tv_nsec); - now = ktime_sub(now, *offs_real); - return now; + return ktime_add_ns(base, nsecs); } #ifdef CONFIG_HIGH_RES_TIMERS -- cgit v1.1 From 9a6b51976ea3a326b6de534beec3fd87275f4ef6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:22 +0000 Subject: timekeeping: Provide ktime_mono_to_any() ktime based conversion function to map a monotonic time stamp to a different CLOCK. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e993503..032e77a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -451,6 +451,26 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) EXPORT_SYMBOL_GPL(ktime_get_with_offset); /** + * ktime_mono_to_any() - convert mononotic time to any other time + * @tmono: time to convert. 
+ * @offs: which offset to use + */ +ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) +{ + ktime_t *offset = offsets[offs]; + unsigned long seq; + ktime_t tconv; + + do { + seq = read_seqcount_begin(&tk_core.seq); + tconv = ktime_add(tmono, *offset); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return tconv; +} +EXPORT_SYMBOL_GPL(ktime_mono_to_any); + +/** * ktime_get_ts64 - get the monotonic clock in timespec64 format * @ts: pointer to timespec variable * -- cgit v1.1 From dcaab54e348c5b66cca4802815ceebd37059e70c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:28 +0000 Subject: timekeeping: Remove ktime_get_monotonic_offset() No more users. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 032e77a..f7378ea 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1708,24 +1708,6 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, #endif /** - * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format - */ -ktime_t ktime_get_monotonic_offset(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; - struct timespec64 wtom; - - do { - seq = read_seqcount_begin(&tk_core.seq); - wtom = tk->wall_to_monotonic; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - return timespec64_to_ktime(wtom); -} -EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); - -/** * do_adjtimex() - Accessor function to NTP __do_adjtimex function */ int do_adjtimex(struct timex *txc) -- cgit v1.1 From d560fed6abe0f9975b509e4fb824e08ac19adc93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:31 +0000 Subject: time: Export nsecs_to_jiffies() Required for moving drivers to the nanosecond based interfaces. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/time.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 278c63c..f0294ba 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -757,6 +757,7 @@ unsigned long nsecs_to_jiffies(u64 n) { return (unsigned long)nsecs_to_jiffies64(n); } +EXPORT_SYMBOL_GPL(nsecs_to_jiffies); /* * Add two timespec values and do a safety check for overflow. -- cgit v1.1 From 57e0be041d9e21a7397eed3b67a7936ac4ac83c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:32 +0000 Subject: sched: Make task->real_start_time nanoseconds based Simplify the only user of this data by removing the timespec conversion. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/fork.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8f54193..a7ab82d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1263,8 +1263,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, posix_cpu_timers_init(p); ktime_get_ts(&p->start_time); - p->real_start_time = p->start_time; - monotonic_to_bootbased(&p->real_start_time); + p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; if (clone_flags & CLONE_THREAD) -- cgit v1.1 From ccbf62d8a284cf181ac28c8e8407dd077d90dd4b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:34 +0000 Subject: sched: Make task->start_time nanoseconds based Simplify the timespec to nsec/usec conversions. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/acct.c | 10 +++------- kernel/fork.c | 2 +- kernel/tsacct.c | 19 +++++++++---------- 3 files changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 1be013c..a1844f1 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -458,9 +458,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, acct_t ac; mm_segment_t fs; unsigned long flim; - u64 elapsed; - u64 run_time; - struct timespec uptime; + u64 elapsed, run_time; struct tty_struct *tty; const struct cred *orig_cred; @@ -484,10 +482,8 @@ static void do_acct_process(struct bsd_acct_struct *acct, strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); /* calculate run_time in nsec*/ - ktime_get_ts(&uptime); - run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; - run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC - + current->group_leader->start_time.tv_nsec; + run_time = ktime_get_ns(); + run_time -= current->group_leader->start_time; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); #if ACCT_VERSION==3 diff --git a/kernel/fork.c b/kernel/fork.c index a7ab82d..627b7f8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1262,7 +1262,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, posix_cpu_timers_init(p); - ktime_get_ts(&p->start_time); + p->start_time = ktime_get_ns(); p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index ea6d170..975cb49 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -31,20 +31,19 @@ void bacct_add_tsk(struct user_namespace *user_ns, struct taskstats *stats, struct task_struct *tsk) { const struct cred *tcred; - struct timespec uptime, ts; cputime_t utime, stime, utimescaled, stimescaled; - u64 ac_etime; + u64 delta; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); - /* calculate task elapsed time in timespec */ - 
ktime_get_ts(&uptime); - ts = timespec_sub(uptime, tsk->start_time); - /* rebase elapsed time to usec (should never be negative) */ - ac_etime = timespec_to_ns(&ts); - do_div(ac_etime, NSEC_PER_USEC); - stats->ac_etime = ac_etime; - stats->ac_btime = get_seconds() - ts.tv_sec; + /* calculate task elapsed time in nsec */ + delta = ktime_get_ns() - tsk->start_time; + /* Convert to micro seconds */ + do_div(delta, NSEC_PER_USEC); + stats->ac_etime = delta; + /* Convert to seconds for btime */ + do_div(delta, USEC_PER_SEC); + stats->ac_btime = get_seconds() - delta; if (thread_group_leader(tsk)) { stats->ac_exitcode = tsk->exit_code; if (tsk->flags & PF_FORKNOEXEC) -- cgit v1.1 From 9667a23db0dc0bd4892f0ada7e4e71528eaeed62 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:35 +0000 Subject: delayacct: Make accounting nanosecond based Kill the timespec juggling and calculate with plain nanoseconds. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/delayacct.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index de699f4..cf2e65d 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -46,32 +46,25 @@ void __delayacct_tsk_init(struct task_struct *tsk) } /* - * Finish delay accounting for a statistic using - * its timestamps (@start, @end), accumalator (@total) and @count + * Finish delay accounting for a statistic using its timestamps (@start), + * accumalator (@total) and @count */ - -static void delayacct_end(struct timespec *start, struct timespec *end, - u64 *total, u32 *count) +static void delayacct_end(u64 *start, u64 *total, u32 *count) { - struct timespec ts; - s64 ns; + s64 ns = ktime_get_ns() - *start; unsigned long flags; - ktime_get_ts(end); - ts = timespec_sub(*end, *start); - ns = timespec_to_ns(&ts); - if (ns < 0) - return; - - spin_lock_irqsave(&current->delays->lock, flags); - *total += ns; - 
(*count)++; - spin_unlock_irqrestore(&current->delays->lock, flags); + if (ns > 0) { + spin_lock_irqsave(&current->delays->lock, flags); + *total += ns; + (*count)++; + spin_unlock_irqrestore(&current->delays->lock, flags); + } } void __delayacct_blkio_start(void) { - ktime_get_ts(&current->delays->blkio_start); + current->delays->blkio_start = ktime_get_ns(); } void __delayacct_blkio_end(void) @@ -79,12 +72,10 @@ void __delayacct_blkio_end(void) if (current->delays->flags & DELAYACCT_PF_SWAPIN) /* Swapin block I/O */ delayacct_end(&current->delays->blkio_start, - &current->delays->blkio_end, &current->delays->swapin_delay, &current->delays->swapin_count); else /* Other block I/O */ delayacct_end(&current->delays->blkio_start, - &current->delays->blkio_end, &current->delays->blkio_delay, &current->delays->blkio_count); } @@ -159,13 +150,12 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) void __delayacct_freepages_start(void) { - ktime_get_ts(&current->delays->freepages_start); + current->delays->freepages_start = ktime_get_ns(); } void __delayacct_freepages_end(void) { delayacct_end(&current->delays->freepages_start, - &current->delays->freepages_end, &current->delays->freepages_delay, &current->delays->freepages_count); } -- cgit v1.1 From 68f6783d28316affcd2ce332d949e40e4c7416bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:37 +0000 Subject: delayacct: Remove braindamaged type conversions Converting cputime to timespec and timespec to nanoseconds makes no sense. Use cputime_to_ns() and be done with it. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/delayacct.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index cf2e65d..ef90b04 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -82,23 +82,19 @@ void __delayacct_blkio_end(void) int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { - s64 tmp; - unsigned long t1; - unsigned long long t2, t3; - unsigned long flags; - struct timespec ts; cputime_t utime, stime, stimescaled, utimescaled; + unsigned long long t2, t3; + unsigned long flags, t1; + s64 tmp; - tmp = (s64)d->cpu_run_real_total; task_cputime(tsk, &utime, &stime); - cputime_to_timespec(utime + stime, &ts); - tmp += timespec_to_ns(&ts); + tmp = (s64)d->cpu_run_real_total; + tmp += cputime_to_nsecs(utime + stime); d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; - tmp = (s64)d->cpu_scaled_run_real_total; task_cputime_scaled(tsk, &utimescaled, &stimescaled); - cputime_to_timespec(utimescaled + stimescaled, &ts); - tmp += timespec_to_ns(&ts); + tmp = (s64)d->cpu_scaled_run_real_total; + tmp += cputime_to_nsecs(utimescaled + stimescaled); d->cpu_scaled_run_real_total = (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; -- cgit v1.1 From 250fade8af2ac5dda8d5106ea06738b6f9e768a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:55 +0000 Subject: timekeeping: Remove monotonic_to_bootbased No more users. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f7378ea..b356135 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1570,21 +1570,6 @@ void get_monotonic_boottime(struct timespec *ts) } EXPORT_SYMBOL_GPL(get_monotonic_boottime); -/** - * monotonic_to_bootbased - Convert the monotonic time to boot based. - * @ts: pointer to the timespec to be converted - */ -void monotonic_to_bootbased(struct timespec *ts) -{ - struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 ts64; - - ts64 = timespec_to_timespec64(*ts); - ts64 = timespec64_add(ts64, tk->total_sleep_time); - *ts = timespec64_to_timespec(ts64); -} -EXPORT_SYMBOL_GPL(monotonic_to_bootbased); - unsigned long get_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; -- cgit v1.1 From 48f18fd6addc199f330d838d54fe7b0a0892adaa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:57 +0000 Subject: timekeeping: Use ktime_get_boottime() for get_monotonic_boottime() get_monotonic_boottime() is not used in fast paths, so the extra timespec conversion is not problematic. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 34 ---------------------------------- 1 file changed, 34 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b356135..f63476f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1536,40 +1536,6 @@ void getboottime(struct timespec *ts) } EXPORT_SYMBOL_GPL(getboottime); -/** - * get_monotonic_boottime - Returns monotonic time since boot - * @ts: pointer to the timespec to be set - * - * Returns the monotonic time since boot in a timespec. - * - * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also - * includes the time spent in suspend. 
- */ -void get_monotonic_boottime(struct timespec *ts) -{ - struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 tomono, sleep, ret; - s64 nsec; - unsigned int seq; - - WARN_ON(timekeeping_suspended); - - do { - seq = read_seqcount_begin(&tk_core.seq); - ret.tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(tk); - tomono = tk->wall_to_monotonic; - sleep = tk->total_sleep_time; - - } while (read_seqcount_retry(&tk_core.seq, seq)); - - ret.tv_sec += tomono.tv_sec + sleep.tv_sec; - ret.tv_nsec = 0; - timespec64_add_ns(&ret, nsec + tomono.tv_nsec + sleep.tv_nsec); - *ts = timespec64_to_timespec(ret); -} -EXPORT_SYMBOL_GPL(get_monotonic_boottime); - unsigned long get_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; -- cgit v1.1 From 02cba1598a2a3b689e79ad6dad2532521f638271 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:04:58 +0000 Subject: timekeeping: Simplify getboottime() Subtracting plain nsec values and converting to timespec is simpler than the whole timespec math. Not really fastpath code, so the division is not an issue. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f63476f..3edc0c1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1525,14 +1525,9 @@ out: void getboottime(struct timespec *ts) { struct timekeeper *tk = &tk_core.timekeeper; - struct timespec boottime = { - .tv_sec = tk->wall_to_monotonic.tv_sec + - tk->total_sleep_time.tv_sec, - .tv_nsec = tk->wall_to_monotonic.tv_nsec + - tk->total_sleep_time.tv_nsec - }; - - set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); + ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); + + *ts = ktime_to_timespec(t); } EXPORT_SYMBOL_GPL(getboottime); -- cgit v1.1 From 47da70d32535000ec29cc206cfc1d318fbd8761f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:05:00 +0000 Subject: timekeeping: Remove timekeeper.total_sleep_time No more users. 
Remove it. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3edc0c1..50d5de0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -97,13 +97,9 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); } -static void tk_set_sleep_time(struct timekeeper *tk, struct timespec64 t) +static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - /* Verify consistency before modifying */ - WARN_ON_ONCE(tk->offs_boot.tv64 != timespec64_to_ktime(tk->total_sleep_time).tv64); - - tk->total_sleep_time = t; - tk->offs_boot = timespec64_to_ktime(t); + tk->offs_boot = ktime_add(tk->offs_boot, delta); } /** @@ -919,10 +915,6 @@ void __init timekeeping_init(void) set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); tk_set_wall_to_mono(tk, tmp); - tmp.tv_sec = 0; - tmp.tv_nsec = 0; - tk_set_sleep_time(tk, tmp); - timekeeping_update(tk, TK_MIRROR); write_seqcount_end(&tk_core.seq); @@ -950,7 +942,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, } tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); - tk_set_sleep_time(tk, timespec64_add(tk->total_sleep_time, *delta)); + tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } -- cgit v1.1 From 61edec81d260bc96a73c878bbdb4c614460346da Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:05:01 +0000 Subject: timekeeping: Simplify timekeeping_clocktai() timekeeping_clocktai() is not used in fast paths, so the extra timespec conversion is not problematic. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 31 ------------------------------- 1 file changed, 31 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 50d5de0..118e91e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -497,37 +497,6 @@ void ktime_get_ts64(struct timespec64 *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts64); - -/** - * timekeeping_clocktai - Returns the TAI time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. - */ -void timekeeping_clocktai(struct timespec *ts) -{ - struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 ts64; - unsigned long seq; - u64 nsecs; - - WARN_ON(timekeeping_suspended); - - do { - seq = read_seqcount_begin(&tk_core.seq); - - ts64.tv_sec = tk->xtime_sec + tk->tai_offset; - nsecs = timekeeping_get_ns(tk); - - } while (read_seqcount_retry(&tk_core.seq, seq)); - - ts64.tv_nsec = 0; - timespec64_add_ns(&ts64, nsecs); - *ts = timespec64_to_timespec(ts64); - -} -EXPORT_SYMBOL(timekeeping_clocktai); - #ifdef CONFIG_NTP_PPS /** -- cgit v1.1 From f519b1a2e08c913375324a927992bb328387f169 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:05:04 +0000 Subject: timekeeping: Provide ktime_get_raw() Provide a ktime_t based interface for raw monotonic time. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 118e91e..af8051f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -305,6 +305,9 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) nsec *= NSEC_PER_SEC; nsec += tk->wall_to_monotonic.tv_nsec; tk->base_mono = ns_to_ktime(nsec); + + /* Update the monotonic raw base */ + tk->base_raw = timespec64_to_ktime(tk->raw_time); } /* must hold timekeeper_lock */ @@ -467,6 +470,27 @@ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) EXPORT_SYMBOL_GPL(ktime_mono_to_any); /** + * ktime_get_raw - Returns the raw monotonic time in ktime_t format + */ +ktime_t ktime_get_raw(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + s64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = tk->base_raw; + nsecs = timekeeping_get_ns_raw(tk); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_raw); + +/** * ktime_get_ts64 - get the monotonic clock in timespec64 format * @ts: pointer to timespec variable * @@ -878,6 +902,7 @@ void __init timekeeping_init(void) tk_set_xtime(tk, &now); tk->raw_time.tv_sec = 0; tk->raw_time.tv_nsec = 0; + tk->base_raw.tv64 = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(tk); -- cgit v1.1 From 3a97837784acbf9fed699fc04d1799b0eb742fdf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:05:10 +0000 Subject: clocksource: Make delta calculation a function We want to move the TSC sanity check into core code to make NMI safe accessors to clock monotonic[_raw] possible. For this we need to sanity check the delta calculation. Create a helper function and convert all sites to use it. 
[ Build fix from jstultz ] Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/clocksource.c | 12 +++++++----- kernel/time/timekeeping.c | 26 ++++++++++++++------------ kernel/time/timekeeping_internal.h | 6 ++++++ 3 files changed, 27 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ba3e502..2e949cc 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -32,6 +32,7 @@ #include #include "tick-internal.h" +#include "timekeeping_internal.h" void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, @@ -249,7 +250,7 @@ void clocksource_mark_unstable(struct clocksource *cs) static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; - cycle_t csnow, wdnow; + cycle_t csnow, wdnow, delta; int64_t wd_nsec, cs_nsec; int next_cpu, reset_pending; @@ -282,11 +283,12 @@ static void clocksource_watchdog(unsigned long data) continue; } - wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, - watchdog->mult, watchdog->shift); + delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask); + wd_nsec = clocksource_cyc2ns(delta, watchdog->mult, + watchdog->shift); - cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & - cs->mask, cs->mult, cs->shift); + delta = clocksource_delta(csnow, cs->cs_last, cs->mask); + cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); cs->cs_last = csnow; cs->wd_last = wdnow; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index af8051f..5318050 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -173,7 +173,7 @@ static inline u32 arch_gettimeoffset(void) { return 0; } static inline s64 timekeeping_get_ns(struct timekeeper *tk) { - cycle_t cycle_now, cycle_delta; + cycle_t cycle_now, delta; struct clocksource *clock; s64 nsec; @@ -182,9 +182,9 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk) cycle_now = clock->read(clock); /* 
calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + delta = clocksource_delta(cycle_now, clock->cycle_last, clock->mask); - nsec = cycle_delta * tk->mult + tk->xtime_nsec; + nsec = delta * tk->mult + tk->xtime_nsec; nsec >>= tk->shift; /* If arch requires, add in get_arch_timeoffset() */ @@ -193,7 +193,7 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk) static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { - cycle_t cycle_now, cycle_delta; + cycle_t cycle_now, delta; struct clocksource *clock; s64 nsec; @@ -202,10 +202,10 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + delta = clocksource_delta(cycle_now, clock->cycle_last, clock->mask); /* convert delta to nanoseconds. */ - nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); /* If arch requires, add in get_arch_timeoffset() */ return nsec + arch_gettimeoffset(); @@ -336,23 +336,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { - cycle_t cycle_now, cycle_delta; + cycle_t cycle_now, delta; struct clocksource *clock; s64 nsec; clock = tk->clock; cycle_now = clock->read(clock); - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + delta = clocksource_delta(cycle_now, clock->cycle_last, clock->mask); tk->cycle_last = clock->cycle_last = cycle_now; - tk->xtime_nsec += cycle_delta * tk->mult; + tk->xtime_nsec += delta * tk->mult; /* If arch requires, add in get_arch_timeoffset() */ tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; tk_normalize_xtime(tk); - nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + nsec = clocksource_cyc2ns(delta, clock->mult, 
clock->shift); timespec64_add_ns(&tk->raw_time, nsec); } @@ -1026,7 +1026,8 @@ static void timekeeping_resume(void) u32 shift = clock->shift; s64 nsec = 0; - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + cycle_delta = clocksource_delta(cycle_now, clock->cycle_last, + clock->mask); /* * "cycle_delta * mutl" may cause 64 bits overflow, if the @@ -1432,7 +1433,8 @@ void update_wall_time(void) #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = (clock->read(clock) - clock->cycle_last) & clock->mask; + offset = clocksource_delta(clock->read(clock), clock->cycle_last, + clock->mask); #endif /* Check if there's really nothing to do */ diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index e3d28ad..05dfa6b 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -3,6 +3,7 @@ /* * timekeeping debug functions */ +#include <linux/clocksource.h> #include <linux/time.h> #ifdef CONFIG_DEBUG_FS @@ -11,4 +12,9 @@ extern void tk_debug_account_sleep_time(struct timespec64 *t); #define tk_debug_account_sleep_time(x) #endif +static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +{ + return (now - last) & mask; +} + #endif /* _TIMEKEEPING_INTERNAL_H */ -- cgit v1.1 From 09ec54429c6d10f87d1f084de53ae2c1c3a81108 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:05:12 +0000 Subject: clocksource: Move cycle_last validation to core code The only user of the cycle_last validation is the x86 TSC. In order to provide NMI safe accessor functions for clock monotonic and monotonic_raw we need to do that in the core. We can't do the TSC specific if (now < cycle_last) now = cycle_last; for the other wrapping around clocksources, but TSC has CLOCKSOURCE_MASK(64) which actually does not mask out anything so if now is less than cycle_last the subtraction will give a negative result. So we can check for that in clocksource_delta() and return 0 for that case.
Implement and enable it for x86 Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/Kconfig | 5 +++++ kernel/time/timekeeping_internal.h | 9 +++++++++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index feccfd8..d626dc9 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG config ARCH_CLOCKSOURCE_DATA bool +# Clocksources require validation of the clocksource against the last +# cycle update - x86/TSC misfeature +config CLOCKSOURCE_VALIDATE_LAST_CYCLE + bool + # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 05dfa6b..4ea005a 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -12,9 +12,18 @@ extern void tk_debug_account_sleep_time(struct timespec64 *t); #define tk_debug_account_sleep_time(x) #endif +#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE +static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +{ + cycle_t ret = (now - last) & mask; + + return (s64) ret > 0 ? ret : 0; +} +#else static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) { return (now - last) & mask; } +#endif #endif /* _TIMEKEEPING_INTERNAL_H */ -- cgit v1.1 From 4a0e637738f06673725792d74eed67f8779b62c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:05:13 +0000 Subject: clocksource: Get rid of cycle_last cycle_last was added to the clocksource to support the TSC validation. We moved that to the core code, so we can get rid of the extra copy. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5318050..4e748c4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -121,7 +121,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) old_clock = tk->clock; tk->clock = clock; - tk->cycle_last = clock->cycle_last = clock->read(clock); + tk->cycle_last = clock->read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -182,7 +182,7 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk) cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ - delta = clocksource_delta(cycle_now, clock->cycle_last, clock->mask); + delta = clocksource_delta(cycle_now, tk->cycle_last, clock->mask); nsec = delta * tk->mult + tk->xtime_nsec; nsec >>= tk->shift; @@ -202,7 +202,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ - delta = clocksource_delta(cycle_now, clock->cycle_last, clock->mask); + delta = clocksource_delta(cycle_now, tk->cycle_last, clock->mask); /* convert delta to nanoseconds. 
*/ nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); @@ -218,7 +218,8 @@ static inline void update_vsyscall(struct timekeeper *tk) struct timespec xt; xt = tk_xtime(tk); - update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); + update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult, + tk->cycle_last); } static inline void old_vsyscall_fixup(struct timekeeper *tk) @@ -342,8 +343,8 @@ static void timekeeping_forward_now(struct timekeeper *tk) clock = tk->clock; cycle_now = clock->read(clock); - delta = clocksource_delta(cycle_now, clock->cycle_last, clock->mask); - tk->cycle_last = clock->cycle_last = cycle_now; + delta = clocksource_delta(cycle_now, tk->cycle_last, clock->mask); + tk->cycle_last = cycle_now; tk->xtime_nsec += delta * tk->mult; @@ -1020,13 +1021,13 @@ static void timekeeping_resume(void) */ cycle_now = clock->read(clock); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && - cycle_now > clock->cycle_last) { + cycle_now > tk->cycle_last) { u64 num, max = ULLONG_MAX; u32 mult = clock->mult; u32 shift = clock->shift; s64 nsec = 0; - cycle_delta = clocksource_delta(cycle_now, clock->cycle_last, + cycle_delta = clocksource_delta(cycle_now, tk->cycle_last, clock->mask); /* @@ -1053,7 +1054,7 @@ static void timekeeping_resume(void) __timekeeping_inject_sleeptime(tk, &ts_delta); /* Re-base the last cycle value */ - tk->cycle_last = clock->cycle_last = cycle_now; + tk->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); @@ -1433,7 +1434,7 @@ void update_wall_time(void) #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = clocksource_delta(clock->read(clock), clock->cycle_last, + offset = clocksource_delta(clock->read(clock), tk->cycle_last, clock->mask); #endif @@ -1477,8 +1478,6 @@ void update_wall_time(void) clock_set |= accumulate_nsecs_to_secs(tk); write_seqcount_begin(&tk_core.seq); - /* Update 
clock->cycle_last with the new value */ - clock->cycle_last = tk->cycle_last; /* * Update the real timekeeper. * -- cgit v1.1 From 6d3aadf3e180e09dbefab16478c6876b584ce16e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:05:15 +0000 Subject: timekeeping: Restructure the timekeeper some more Access to time requires to touch two cachelines at minimum 1) The timekeeper data structure 2) The clocksource data structure The access to the clocksource data structure can be avoided as almost all clocksource implementations ignore the argument to the read callback, which is a pointer to the clocksource. But the core needs to touch it to access the members @read and @mask. So we are better off by copying the @read function pointer and the @mask from the clocksource to the core data structure itself. For the most used ktime_get() access all required data including the @read and @mask copies fits together with the sequence counter into a single 64 byte cacheline. For the other time access functions we touch in the current code three cache lines in the worst case. But with the clocksource data copies we can reduce that to two adjacent cachelines, which is more efficient than disjunct cache lines. 
Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4e748c4..14b7367 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -121,7 +121,9 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) old_clock = tk->clock; tk->clock = clock; - tk->cycle_last = clock->read(clock); + tk->read = clock->read; + tk->mask = clock->mask; + tk->cycle_last = tk->read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -174,15 +176,13 @@ static inline u32 arch_gettimeoffset(void) { return 0; } static inline s64 timekeeping_get_ns(struct timekeeper *tk) { cycle_t cycle_now, delta; - struct clocksource *clock; s64 nsec; /* read clocksource: */ - clock = tk->clock; - cycle_now = clock->read(clock); + cycle_now = tk->read(tk->clock); /* calculate the delta since the last update_wall_time: */ - delta = clocksource_delta(cycle_now, tk->cycle_last, clock->mask); + delta = clocksource_delta(cycle_now, tk->cycle_last, tk->mask); nsec = delta * tk->mult + tk->xtime_nsec; nsec >>= tk->shift; @@ -193,16 +193,15 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk) static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { + struct clocksource *clock = tk->clock; cycle_t cycle_now, delta; - struct clocksource *clock; s64 nsec; /* read clocksource: */ - clock = tk->clock; - cycle_now = clock->read(clock); + cycle_now = tk->read(clock); /* calculate the delta since the last update_wall_time: */ - delta = clocksource_delta(cycle_now, tk->cycle_last, clock->mask); + delta = clocksource_delta(cycle_now, tk->cycle_last, tk->mask); /* convert delta to nanoseconds. 
*/ nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); @@ -337,13 +336,12 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { + struct clocksource *clock = tk->clock; cycle_t cycle_now, delta; - struct clocksource *clock; s64 nsec; - clock = tk->clock; - cycle_now = clock->read(clock); - delta = clocksource_delta(cycle_now, tk->cycle_last, clock->mask); + cycle_now = tk->read(clock); + delta = clocksource_delta(cycle_now, tk->cycle_last, tk->mask); tk->cycle_last = cycle_now; tk->xtime_nsec += delta * tk->mult; @@ -1019,7 +1017,7 @@ static void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ - cycle_now = clock->read(clock); + cycle_now = tk->read(clock); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && cycle_now > tk->cycle_last) { u64 num, max = ULLONG_MAX; @@ -1028,7 +1026,7 @@ static void timekeeping_resume(void) s64 nsec = 0; cycle_delta = clocksource_delta(cycle_now, tk->cycle_last, - clock->mask); + tk->mask); /* * "cycle_delta * mutl" may cause 64 bits overflow, if the @@ -1415,7 +1413,6 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, */ void update_wall_time(void) { - struct clocksource *clock; struct timekeeper *real_tk = &tk_core.timekeeper; struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; @@ -1429,13 +1426,11 @@ void update_wall_time(void) if (unlikely(timekeeping_suspended)) goto out; - clock = real_tk->clock; - #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = clocksource_delta(clock->read(clock), tk->cycle_last, - clock->mask); + offset = clocksource_delta(tk->read(tk->clock), tk->cycle_last, + tk->mask); #endif /* Check if there's really nothing to do */ -- cgit v1.1 From d28ede83791defee9a81e558540699dc46dbbe13 Mon Sep 17 00:00:00 2001 From: Thomas 
Gleixner Date: Wed, 16 Jul 2014 21:05:16 +0000 Subject: timekeeping: Create struct tk_read_base and use it in struct timekeeper The members of the new struct are the required ones for the new NMI safe accessor to clock monotonic. In order to reuse the existing timekeeping code and to make the update of the fast NMI safe timekeepers a simple memcpy, use the struct for the timekeeper as well and convert all users. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Mathieu Desnoyers Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 132 +++++++++++++++++++++++----------------------- 1 file changed, 66 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 14b7367..ccb6998 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -52,8 +52,8 @@ bool __read_mostly persistent_clock_exist = false; static inline void tk_normalize_xtime(struct timekeeper *tk) { - while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { - tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; + while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { + tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; tk->xtime_sec++; } } @@ -63,20 +63,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk) struct timespec64 ts; ts.tv_sec = tk->xtime_sec; - ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); + ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); return ts; } static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; - tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; + tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; - tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; + tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; tk_normalize_xtime(tk); } @@ -119,11 +119,11 @@ static void
tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; - old_clock = tk->clock; - tk->clock = clock; - tk->read = clock->read; - tk->mask = clock->mask; - tk->cycle_last = tk->read(clock); + old_clock = tk->tkr.clock; + tk->tkr.clock = clock; + tk->tkr.read = clock->read; + tk->tkr.mask = clock->mask; + tk->tkr.cycle_last = tk->tkr.read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -147,11 +147,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) - tk->xtime_nsec >>= -shift_change; + tk->tkr.xtime_nsec >>= -shift_change; else - tk->xtime_nsec <<= shift_change; + tk->tkr.xtime_nsec <<= shift_change; } - tk->shift = clock->shift; + tk->tkr.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; @@ -161,7 +161,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ - tk->mult = clock->mult; + tk->tkr.mult = clock->mult; } /* Timekeeper helper functions. 
*/ @@ -179,13 +179,13 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk) s64 nsec; /* read clocksource: */ - cycle_now = tk->read(tk->clock); + cycle_now = tk->tkr.read(tk->tkr.clock); /* calculate the delta since the last update_wall_time: */ - delta = clocksource_delta(cycle_now, tk->cycle_last, tk->mask); + delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); - nsec = delta * tk->mult + tk->xtime_nsec; - nsec >>= tk->shift; + nsec = delta * tk->tkr.mult + tk->tkr.xtime_nsec; + nsec >>= tk->tkr.shift; /* If arch requires, add in get_arch_timeoffset() */ return nsec + arch_gettimeoffset(); @@ -193,15 +193,15 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk) static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { - struct clocksource *clock = tk->clock; + struct clocksource *clock = tk->tkr.clock; cycle_t cycle_now, delta; s64 nsec; /* read clocksource: */ - cycle_now = tk->read(clock); + cycle_now = tk->tkr.read(clock); /* calculate the delta since the last update_wall_time: */ - delta = clocksource_delta(cycle_now, tk->cycle_last, tk->mask); + delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); /* convert delta to nanoseconds. */ nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); @@ -217,8 +217,8 @@ static inline void update_vsyscall(struct timekeeper *tk) struct timespec xt; xt = tk_xtime(tk); - update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult, - tk->cycle_last); + update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult, + tk->tkr.cycle_last); } static inline void old_vsyscall_fixup(struct timekeeper *tk) @@ -235,11 +235,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD * users are removed, this can be killed. 
*/ - remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); - tk->xtime_nsec -= remainder; - tk->xtime_nsec += 1ULL << tk->shift; + remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); + tk->tkr.xtime_nsec -= remainder; + tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; } #else #define old_vsyscall_fixup(tk) @@ -304,7 +304,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); nsec *= NSEC_PER_SEC; nsec += tk->wall_to_monotonic.tv_nsec; - tk->base_mono = ns_to_ktime(nsec); + tk->tkr.base_mono = ns_to_ktime(nsec); /* Update the monotonic raw base */ tk->base_raw = timespec64_to_ktime(tk->raw_time); @@ -336,18 +336,18 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { - struct clocksource *clock = tk->clock; + struct clocksource *clock = tk->tkr.clock; cycle_t cycle_now, delta; s64 nsec; - cycle_now = tk->read(clock); - delta = clocksource_delta(cycle_now, tk->cycle_last, tk->mask); - tk->cycle_last = cycle_now; + cycle_now = tk->tkr.read(clock); + delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); + tk->tkr.cycle_last = cycle_now; - tk->xtime_nsec += delta * tk->mult; + tk->tkr.xtime_nsec += delta * tk->tkr.mult; /* If arch requires, add in get_arch_timeoffset() */ - tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; + tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; tk_normalize_xtime(tk); @@ -412,7 +412,7 @@ ktime_t ktime_get(void) do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->base_mono; + base = tk->tkr.base_mono; nsecs = timekeeping_get_ns(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -438,7 +438,7 @@ ktime_t ktime_get_with_offset(enum tk_offsets 
offs) do { seq = read_seqcount_begin(&tk_core.seq); - base = ktime_add(tk->base_mono, *offset); + base = ktime_add(tk->tkr.base_mono, *offset); nsecs = timekeeping_get_ns(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -731,7 +731,7 @@ static int change_clocksource(void *data) */ if (try_module_get(new->owner)) { if (!new->enable || new->enable(new) == 0) { - old = tk->clock; + old = tk->tkr.clock; tk_setup_internals(tk, new); if (old->disable) old->disable(old); @@ -759,11 +759,11 @@ int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &tk_core.timekeeper; - if (tk->clock == clock) + if (tk->tkr.clock == clock) return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); - return tk->clock == clock ? 0 : -1; + return tk->tkr.clock == clock ? 0 : -1; } /** @@ -803,7 +803,7 @@ int timekeeping_valid_for_hres(void) do { seq = read_seqcount_begin(&tk_core.seq); - ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -822,7 +822,7 @@ u64 timekeeping_max_deferment(void) do { seq = read_seqcount_begin(&tk_core.seq); - ret = tk->clock->max_idle_ns; + ret = tk->tkr.clock->max_idle_ns; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -989,7 +989,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) static void timekeeping_resume(void) { struct timekeeper *tk = &tk_core.timekeeper; - struct clocksource *clock = tk->clock; + struct clocksource *clock = tk->tkr.clock; unsigned long flags; struct timespec64 ts_new, ts_delta; struct timespec tmp; @@ -1017,16 +1017,16 @@ static void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. 
*/ - cycle_now = tk->read(clock); + cycle_now = tk->tkr.read(clock); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && - cycle_now > tk->cycle_last) { + cycle_now > tk->tkr.cycle_last) { u64 num, max = ULLONG_MAX; u32 mult = clock->mult; u32 shift = clock->shift; s64 nsec = 0; - cycle_delta = clocksource_delta(cycle_now, tk->cycle_last, - tk->mask); + cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, + tk->tkr.mask); /* * "cycle_delta * mutl" may cause 64 bits overflow, if the @@ -1052,7 +1052,7 @@ static void timekeeping_resume(void) __timekeeping_inject_sleeptime(tk, &ts_delta); /* Re-base the last cycle value */ - tk->cycle_last = cycle_now; + tk->tkr.cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); @@ -1239,12 +1239,12 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) } } - if (unlikely(tk->clock->maxadj && - (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { + if (unlikely(tk->tkr.clock->maxadj && + (tk->tkr.mult + adj > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { printk_deferred_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", - tk->clock->name, (long)tk->mult + adj, - (long)tk->clock->mult + tk->clock->maxadj); + tk->tkr.clock->name, (long)tk->tkr.mult + adj, + (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); } /* * So the following can be confusing. @@ -1295,9 +1295,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * * XXX - TODO: Doc ntp_error calculation. */ - tk->mult += adj; + tk->tkr.mult += adj; tk->xtime_interval += interval; - tk->xtime_nsec -= offset; + tk->tkr.xtime_nsec -= offset; tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; out_adjust: @@ -1315,9 +1315,9 @@ out_adjust: * We'll correct this error next time through this function, when * xtime_nsec is not as small. 
*/ - if (unlikely((s64)tk->xtime_nsec < 0)) { - s64 neg = -(s64)tk->xtime_nsec; - tk->xtime_nsec = 0; + if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { + s64 neg = -(s64)tk->tkr.xtime_nsec; + tk->tkr.xtime_nsec = 0; tk->ntp_error += neg << tk->ntp_error_shift; } @@ -1333,13 +1333,13 @@ out_adjust: */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { - u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; + u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; unsigned int clock_set = 0; - while (tk->xtime_nsec >= nsecps) { + while (tk->tkr.xtime_nsec >= nsecps) { int leap; - tk->xtime_nsec -= nsecps; + tk->tkr.xtime_nsec -= nsecps; tk->xtime_sec++; /* Figure out if its a leap sec and apply if needed */ @@ -1384,9 +1384,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, /* Accumulate one shifted interval */ offset -= interval; - tk->cycle_last += interval; + tk->tkr.cycle_last += interval; - tk->xtime_nsec += tk->xtime_interval << shift; + tk->tkr.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ @@ -1429,8 +1429,8 @@ void update_wall_time(void) #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = clocksource_delta(tk->read(tk->clock), tk->cycle_last, - tk->mask); + offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), + tk->tkr.cycle_last, tk->tkr.mask); #endif /* Check if there's really nothing to do */ @@ -1591,8 +1591,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->base_mono; - nsecs = tk->xtime_nsec >> tk->shift; + base = tk->tkr.base_mono; + nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; @@ -1623,7 +1623,7 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->base_mono; + base = 
tk->tkr.base_mono; nsecs = timekeeping_get_ns(tk); *offs_real = tk->offs_real; -- cgit v1.1 From 0e5ac3a8b100469ea154f87dd57b685fbdd356f6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:05:18 +0000 Subject: timekeeping: Use tk_read_base as argument for timekeeping_get_ns() All the function needs is in the tk_read_base struct. No functional change for the current code, just a preparatory patch for the NMI safe accessor to clock monotonic which will use struct tk_read_base as well. Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ccb6998..dee23c9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -173,19 +173,19 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif -static inline s64 timekeeping_get_ns(struct timekeeper *tk) +static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t cycle_now, delta; s64 nsec; /* read clocksource: */ - cycle_now = tk->tkr.read(tk->tkr.clock); + cycle_now = tkr->read(tkr->clock); /* calculate the delta since the last update_wall_time: */ - delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); - nsec = delta * tk->tkr.mult + tk->tkr.xtime_nsec; - nsec >>= tk->tkr.shift; + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; /* If arch requires, add in get_arch_timeoffset() */ return nsec + arch_gettimeoffset(); @@ -372,7 +372,7 @@ int __getnstimeofday64(struct timespec64 *ts) seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; - nsecs = timekeeping_get_ns(tk); + nsecs = timekeeping_get_ns(&tk->tkr); } while 
(read_seqcount_retry(&tk_core.seq, seq)); @@ -413,7 +413,7 @@ ktime_t ktime_get(void) do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr.base_mono; - nsecs = timekeeping_get_ns(tk); + nsecs = timekeeping_get_ns(&tk->tkr); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -439,7 +439,7 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr.base_mono, *offset); - nsecs = timekeeping_get_ns(tk); + nsecs = timekeeping_get_ns(&tk->tkr); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -509,7 +509,7 @@ void ktime_get_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(tk); + nsec = timekeeping_get_ns(&tk->tkr); tomono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -547,7 +547,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) ts_real->tv_nsec = 0; nsecs_raw = timekeeping_get_ns_raw(tk); - nsecs_real = timekeeping_get_ns(tk); + nsecs_real = timekeeping_get_ns(&tk->tkr); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1624,7 +1624,7 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr.base_mono; - nsecs = timekeeping_get_ns(tk); + nsecs = timekeeping_get_ns(&tk->tkr); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; -- cgit v1.1 From 4396e058c52e167729729cf64ea3dfa229637086 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:05:23 +0000 Subject: timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC Tracers want a correlated time between the kernel instrumentation and user space. We really do not want to export sched_clock() to user space, so we need to provide something sensible for this. Using separate data structures with an non blocking sequence count based update mechanism allows us to do that. 
The data structure required for the readout has a sequence counter and two copies of the timekeeping data. On the update side: smp_wmb(); tkf->seq++; smp_wmb(); update(tkf->base[0], tk); smp_wmb(); tkf->seq++; smp_wmb(); update(tkf->base[1], tk); On the reader side: do { seq = tkf->seq; smp_rmb(); idx = seq & 0x01; now = now(tkf->base[idx]); smp_rmb(); } while (seq != tkf->seq) So if an NMI hits the update of base[0] it will use base[1] which is still consistent, but this timestamp is not guaranteed to be monotonic across an update. The timestamp is calculated by: now = base_mono + clock_delta * slope So if the update lowers the slope, readers who are forced to the not yet updated second array are still using the old steeper slope. tmono ^ | o n | o n | u | o |o |12345678---> reader order o = old slope u = update n = new slope So reader 6 will observe time going backwards versus reader 5. While other CPUs are likely to be able to observe that, the only way for a CPU local observation is when an NMI hits in the middle of the update. Timestamps taken from that NMI context might be ahead of the following timestamps. Callers need to be aware of that and deal with it. V2: Got rid of clock monotonic raw and reorganized the data structures. Folded in the barrier fix from Mathieu. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Mathieu Desnoyers Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dee23c9..8980fb7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -44,6 +44,22 @@ static struct { static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; +/** + * struct tk_fast - NMI safe timekeeper + * @seq: Sequence counter for protecting updates.
The lowest bit + * is the index for the tk_read_base array + * @base: tk_read_base array. Access is indexed by the lowest bit of + * @seq. + * + * See @update_fast_timekeeper() below. + */ +struct tk_fast { + seqcount_t seq; + struct tk_read_base base[2]; +}; + +static struct tk_fast tk_fast_mono ____cacheline_aligned; + /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -210,6 +226,112 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) return nsec + arch_gettimeoffset(); } +/** + * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. + * @tk: The timekeeper from which we take the update + * @tkf: The fast timekeeper to update + * @tbase: The time base for the fast timekeeper (mono/raw) + * + * We want to use this from any context including NMI and tracing / + * instrumenting the timekeeping code itself. + * + * So we handle this differently than the other timekeeping accessor + * functions which retry when the sequence count has changed. The + * update side does: + * + * smp_wmb(); <- Ensure that the last base[1] update is visible + * tkf->seq++; + * smp_wmb(); <- Ensure that the seqcount update is visible + * update(tkf->base[0], tk); + * smp_wmb(); <- Ensure that the base[0] update is visible + * tkf->seq++; + * smp_wmb(); <- Ensure that the seqcount update is visible + * update(tkf->base[1], tk); + * + * The reader side does: + * + * do { + * seq = tkf->seq; + * smp_rmb(); + * idx = seq & 0x01; + * now = now(tkf->base[idx]); + * smp_rmb(); + * } while (seq != tkf->seq) + * + * As long as we update base[0] readers are forced off to + * base[1]. Once base[0] is updated readers are redirected to base[0] + * and the base[1] update takes place. + * + * So if a NMI hits the update of base[0] then it will use base[1] + * which is still consistent. In the worst case this can result is a + * slightly wrong timestamp (a few nanoseconds). See + * @ktime_get_mono_fast_ns. 
+ */ +static void update_fast_timekeeper(struct timekeeper *tk) +{ + struct tk_read_base *base = tk_fast_mono.base; + + /* Force readers off to base[1] */ + raw_write_seqcount_latch(&tk_fast_mono.seq); + + /* Update base[0] */ + memcpy(base, &tk->tkr, sizeof(*base)); + + /* Force readers back to base[0] */ + raw_write_seqcount_latch(&tk_fast_mono.seq); + + /* Update base[1] */ + memcpy(base + 1, base, sizeof(*base)); +} + +/** + * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic + * + * This timestamp is not guaranteed to be monotonic across an update. + * The timestamp is calculated by: + * + * now = base_mono + clock_delta * slope + * + * So if the update lowers the slope, readers who are forced to the + * not yet updated second array are still using the old steeper slope. + * + * tmono + * ^ + * | o n + * | o n + * | u + * | o + * |o + * |12345678---> reader order + * + * o = old slope + * u = update + * n = new slope + * + * So reader 6 will observe time going backwards versus reader 5. + * + * While other CPUs are likely to be able observe that, the only way + * for a CPU local observation is when an NMI hits in the middle of + * the update. Timestamps taken from that NMI context might be ahead + * of the following timestamps. Callers need to be aware of that and + * deal with it. 
+ */ +u64 notrace ktime_get_mono_fast_ns(void) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount(&tk_fast_mono.seq); + tkr = tk_fast_mono.base + (seq & 0x01); + now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); + + } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); + return now; +} +EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); + #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD static inline void update_vsyscall(struct timekeeper *tk) @@ -325,6 +447,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); + + update_fast_timekeeper(tk); } /** -- cgit v1.1 From 1b3e5c0936046e7e023149ddc8946d21c2ea20eb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jul 2014 21:05:25 +0000 Subject: ftrace: Provide trace clocks monotonic Expose the new NMI safe accessor to clock monotonic to the tracer. Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Signed-off-by: John Stultz --- kernel/trace/trace.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f243444..84e2b45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -806,11 +806,12 @@ static struct { const char *name; int in_ns; /* is this clock in nanoseconds? 
*/ } trace_clocks[] = { - { trace_clock_local, "local", 1 }, - { trace_clock_global, "global", 1 }, - { trace_clock_counter, "counter", 0 }, - { trace_clock_jiffies, "uptime", 1 }, - { trace_clock, "perf", 1 }, + { trace_clock_local, "local", 1 }, + { trace_clock_global, "global", 1 }, + { trace_clock_counter, "counter", 0 }, + { trace_clock_jiffies, "uptime", 1 }, + { trace_clock, "perf", 1 }, + { ktime_get_mono_fast_ns, "mono", 1 }, ARCH_TRACE_CLOCKS }; -- cgit v1.1 From e2dff1ec0cc81fcf3e0696604bacc3e1c816538c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 23 Jul 2014 14:35:39 -0700 Subject: timekeeping: Minor fixup for timespec64->timespec assignment In the GENERIC_TIME_VSYSCALL_OLD update_vsyscall implementation, we take the tk_xtime() value, which returns a timespec64, and store it in a timespec. This luckily is ok, since the only architectures that use GENERIC_TIME_VSYSCALL_OLD are ia64 and ppc64, which are both 64 bit systems where timespec64 is the same as a timespec. Even so, for cleanliness reasons, use the conversion function to assign the proper type. Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8980fb7..2b56b95 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -338,7 +338,7 @@ static inline void update_vsyscall(struct timekeeper *tk) { struct timespec xt; - xt = tk_xtime(tk); + xt = timespec64_to_timespec(tk_xtime(tk)); update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult, tk->tkr.cycle_last); } -- cgit v1.1 From dc491596f6394382fbc74ad331156207d619fa0a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 6 Dec 2013 17:25:21 -0800 Subject: timekeeping: Rework frequency adjustments to work better w/ nohz The existing timekeeping_adjust logic has always been complicated to understand. 
Further, since it was developed prior to NOHZ becoming common, it's not surprising it performs poorly when NOHZ is enabled. Since Miroslav pointed out the problematic nature of the existing code in the NOHZ case, I've tried to refactor the code to perform better. The problem with the previous approach was that it tried to adjust for the total cumulative error using a scaled dampening factor. This resulted in large errors being corrected slowly, while small errors were corrected quickly. With NOHZ the timekeeping code doesn't know how far out the next tick will be, so this results in bad over-correction to small errors, and insufficient correction to large errors. Inspired by Miroslav's patch, I've refactored the code to try to address the correction in two steps. 1) Check the future freq error for the next tick, and if the frequency error is large, try to make sure we correct it so it doesn't cause much accumulated error. 2) Then make a small single unit adjustment to correct any cumulative error that has collected over time. This method performs fairly well in the simulator Miroslav created. Major credit to Miroslav for pointing out the issue, providing the original patch to resolve this, a simulator for testing, as well as helping debug and resolve issues in my implementation so that it performed closer to his original implementation. Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Reported-by: Miroslav Lichvar Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 193 ++++++++++++++++++++-------------------------- 1 file changed, 83 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2b56b95..43c706a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -178,6 +178,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) * to counteract clock drifting.
*/ tk->tkr.mult = clock->mult; + tk->ntp_err_mult = 0; } /* Timekeeper helper functions. */ @@ -1257,125 +1258,34 @@ static int __init timekeeping_init_ops(void) register_syscore_ops(&timekeeping_syscore_ops); return 0; } - device_initcall(timekeeping_init_ops); /* - * If the error is already larger, we look ahead even further - * to compensate for late or lost adjustments. - */ -static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, - s64 error, s64 *interval, - s64 *offset) -{ - s64 tick_error, i; - u32 look_ahead, adj; - s32 error2, mult; - - /* - * Use the current error value to determine how much to look ahead. - * The larger the error the slower we adjust for it to avoid problems - * with losing too many ticks, otherwise we would overadjust and - * produce an even larger error. The smaller the adjustment the - * faster we try to adjust for it, as lost ticks can do less harm - * here. This is tuned so that an error of about 1 msec is adjusted - * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). - */ - error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); - error2 = abs(error2); - for (look_ahead = 0; error2 > 0; look_ahead++) - error2 >>= 2; - - /* - * Now calculate the error in (1 << look_ahead) ticks, but first - * remove the single look ahead already included in the error. - */ - tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); - tick_error -= tk->xtime_interval >> 1; - error = ((error - tick_error) >> look_ahead) + tick_error; - - /* Finally calculate the adjustment shift value. */ - i = *interval; - mult = 1; - if (error < 0) { - error = -error; - *interval = -*interval; - *offset = -*offset; - mult = -1; - } - for (adj = 0; error > i; adj++) - error >>= 1; - - *interval <<= adj; - *offset <<= adj; - return mult << adj; -} - -/* - * Adjust the multiplier to reduce the error value, - * this is optimized for the most common adjustments of -1,0,1, - * for other values we can do a bit more work. 
+ * Apply a multiplier adjustment to the timekeeper */ -static void timekeeping_adjust(struct timekeeper *tk, s64 offset) +static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, + s64 offset, + bool negative, + int adj_scale) { - s64 error, interval = tk->cycle_interval; - int adj; + s64 interval = tk->cycle_interval; + s32 mult_adj = 1; - /* - * The point of this is to check if the error is greater than half - * an interval. - * - * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. - * - * Note we subtract one in the shift, so that error is really error*2. - * This "saves" dividing(shifting) interval twice, but keeps the - * (error > interval) comparison as still measuring if error is - * larger than half an interval. - * - * Note: It does not "save" on aggravation when reading the code. - */ - error = tk->ntp_error >> (tk->ntp_error_shift - 1); - if (error > interval) { - /* - * We now divide error by 4(via shift), which checks if - * the error is greater than twice the interval. - * If it is greater, we need a bigadjust, if its smaller, - * we can adjust by 1. 
- */ - error >>= 2; - if (likely(error <= interval)) - adj = 1; - else - adj = timekeeping_bigadjust(tk, error, &interval, &offset); - } else { - if (error < -interval) { - /* See comment above, this is just switched for the negative */ - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else { - adj = timekeeping_bigadjust(tk, error, &interval, &offset); - } - } else { - goto out_adjust; - } + if (negative) { + mult_adj = -mult_adj; + interval = -interval; + offset = -offset; } + mult_adj <<= adj_scale; + interval <<= adj_scale; + offset <<= adj_scale; - if (unlikely(tk->tkr.clock->maxadj && - (tk->tkr.mult + adj > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { - printk_deferred_once(KERN_WARNING - "Adjusting %s more than 11%% (%ld vs %ld)\n", - tk->tkr.clock->name, (long)tk->tkr.mult + adj, - (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); - } /* * So the following can be confusing. * - * To keep things simple, lets assume adj == 1 for now. + * To keep things simple, lets assume mult_adj == 1 for now. * - * When adj != 1, remember that the interval and offset values + * When mult_adj != 1, remember that the interval and offset values * have been appropriately scaled so the math is the same. * * The basic idea here is that we're increasing the multiplier @@ -1419,12 +1329,76 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * * XXX - TODO: Doc ntp_error calculation. 
*/ - tk->tkr.mult += adj; + tk->tkr.mult += mult_adj; tk->xtime_interval += interval; tk->tkr.xtime_nsec -= offset; tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; +} + +/* + * Calculate the multiplier adjustment needed to match the frequency + * specified by NTP + */ +static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, + s64 offset) +{ + s64 interval = tk->cycle_interval; + s64 xinterval = tk->xtime_interval; + s64 tick_error; + bool negative; + u32 adj; + + /* Remove any current error adj from freq calculation */ + if (tk->ntp_err_mult) + xinterval -= tk->cycle_interval; + + /* Calculate current error per tick */ + tick_error = ntp_tick_length() >> tk->ntp_error_shift; + tick_error -= (xinterval + tk->xtime_remainder); + + /* Don't worry about correcting it if its small */ + if (likely((tick_error >= 0) && (tick_error <= interval))) + return; + + /* preserve the direction of correction */ + negative = (tick_error < 0); + + /* Sort out the magnitude of the correction */ + tick_error = abs(tick_error); + for (adj = 0; tick_error > interval; adj++) + tick_error >>= 1; + + /* scale the corrections */ + timekeeping_apply_adjustment(tk, offset, negative, adj); +} + +/* + * Adjust the timekeeper's multiplier to the correct frequency + * and also to reduce the accumulated error value. 
+ */ +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) +{ + /* Correct for the current frequency error */ + timekeeping_freqadjust(tk, offset); + + /* Next make a small adjustment to fix any cumulative error */ + if (!tk->ntp_err_mult && (tk->ntp_error > 0)) { + tk->ntp_err_mult = 1; + timekeeping_apply_adjustment(tk, offset, 0, 0); + } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) { + /* Undo any existing error adjustment */ + timekeeping_apply_adjustment(tk, offset, 1, 0); + tk->ntp_err_mult = 0; + } + + if (unlikely(tk->tkr.clock->maxadj && + (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { + printk_once(KERN_WARNING + "Adjusting %s more than 11%% (%ld vs %ld)\n", + tk->tkr.clock->name, (long)tk->tkr.mult, + (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); + } -out_adjust: /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource @@ -1444,7 +1418,6 @@ out_adjust: tk->tkr.xtime_nsec = 0; tk->ntp_error += neg << tk->ntp_error_shift; } - } /** -- cgit v1.1 From 375f45b5b53a91dfa8f0c11328e0e044f82acbed Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 23 Apr 2014 20:53:29 -0700 Subject: timekeeping: Use cached ntp_tick_length when accumulating error By caching the ntp_tick_length() when we correct the frequency error, and then using that cached value to accumulate error, we avoid large initial errors when the tick length is changed. This makes convergence happen much faster in the simulator, since the initial error doesn't have to be slowly whittled away. This initially seems like an accounting error, but Miroslav pointed out that ntp_tick_length() can change mid-tick, so when we apply it in the error accumulation, we are applying any recent change to the entire tick. 
This approach chooses to apply changes in the ntp_tick_length() only to the next tick, which allows us to calculate the freq correction before using the new tick length, which avoids accumulating error. Credit to Miroslav for pointing this out and providing the original patch this functionality has been pulled out from, along with the rationale. Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Reported-by: Miroslav Lichvar Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 43c706a..f36b028 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -171,6 +171,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + tk->ntp_tick = ntpinterval << tk->ntp_error_shift; /* * The timekeeper keeps its own mult values for the currently @@ -1352,6 +1353,8 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, if (tk->ntp_err_mult) xinterval -= tk->cycle_interval; + tk->ntp_tick = ntp_tick_length(); + /* Calculate current error per tick */ tick_error = ntp_tick_length() >> tk->ntp_error_shift; tick_error -= (xinterval + tk->xtime_remainder); @@ -1497,7 +1500,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, tk->raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ - tk->ntp_error += ntp_tick_length() << shift; + tk->ntp_error += tk->ntp_tick << shift; tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << (tk->ntp_error_shift + shift); -- cgit v1.1 From f5bffecda951b59d0d3cdd616d68952abc52bc40 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 22 Jul 2014 23:01:58 -0700 Subject: net: filter: split filter.c into two files BPF is used in several kernel components.
This split creates logical boundary between generic eBPF core and the rest kernel/bpf/core.c: eBPF interpreter net/core/filter.c: classic->eBPF converter, classic verifiers, socket filters This patch only moves functions. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/Makefile | 1 + kernel/bpf/Makefile | 1 + kernel/bpf/core.c | 536 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 538 insertions(+) create mode 100644 kernel/bpf/Makefile create mode 100644 kernel/bpf/core.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f2a8b62..e7360b7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o +obj-$(CONFIG_NET) += bpf/ obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile new file mode 100644 index 0000000..6a71145 --- /dev/null +++ b/kernel/bpf/Makefile @@ -0,0 +1 @@ +obj-y := core.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c new file mode 100644 index 0000000..77a240a --- /dev/null +++ b/kernel/bpf/core.c @@ -0,0 +1,536 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Based on the design of the Berkeley Packet Filter. The new + * internal format has been designed by PLUMgrid: + * + * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com + * + * Authors: + * + * Jay Schulist + * Alexei Starovoitov + * Daniel Borkmann + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Andi Kleen - Fix a few bad bugs and races. 
+ * Kris Katterjohn - Added many additional checks in sk_chk_filter() + */ +#include +#include +#include + +/* Registers */ +#define BPF_R0 regs[BPF_REG_0] +#define BPF_R1 regs[BPF_REG_1] +#define BPF_R2 regs[BPF_REG_2] +#define BPF_R3 regs[BPF_REG_3] +#define BPF_R4 regs[BPF_REG_4] +#define BPF_R5 regs[BPF_REG_5] +#define BPF_R6 regs[BPF_REG_6] +#define BPF_R7 regs[BPF_REG_7] +#define BPF_R8 regs[BPF_REG_8] +#define BPF_R9 regs[BPF_REG_9] +#define BPF_R10 regs[BPF_REG_10] + +/* Named registers */ +#define DST regs[insn->dst_reg] +#define SRC regs[insn->src_reg] +#define FP regs[BPF_REG_FP] +#define ARG1 regs[BPF_REG_ARG1] +#define CTX regs[BPF_REG_CTX] +#define IMM insn->imm + +/* No hurry in this branch + * + * Exported for the bpf jit load helper. + */ +void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) +{ + u8 *ptr = NULL; + + if (k >= SKF_NET_OFF) + ptr = skb_network_header(skb) + k - SKF_NET_OFF; + else if (k >= SKF_LL_OFF) + ptr = skb_mac_header(skb) + k - SKF_LL_OFF; + if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) + return ptr; + + return NULL; +} + +/* Base function for offset calculation. Needs to go into .text section, + * therefore keeping it non-static as well; will also be used by JITs + * anyway later on, so do not let the compiler omit it. + */ +noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return 0; +} + +/** + * __sk_run_filter - run a filter on a given context + * @ctx: buffer to run the filter on + * @insn: filter to apply + * + * Decode and apply filter instructions to the skb->data. Return length to + * keep, 0 for none. @ctx is the data we are operating on, @insn is the + * array of filter instructions. + */ +static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) +{ + u64 stack[MAX_BPF_STACK / sizeof(u64)]; + u64 regs[MAX_BPF_REG], tmp; + static const void *jumptable[256] = { + [0 ... 
255] = &&default_label, + /* Now overwrite non-defaults ... */ + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, + [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, + [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, + [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, + [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, + [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, + [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, + [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, + [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, + [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, + [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, + [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, + [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, + [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, + [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, + [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, + [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, + [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, + [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, + [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, + [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, + [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, + [BPF_ALU | BPF_NEG] = &&ALU_NEG, + [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, + [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, + /* 64 bit ALU operations */ + [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, + [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, + [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, + [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, + [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, + [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, + [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, + [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, + [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, + [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, + [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, + [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, + [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, + [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, + [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, + [BPF_ALU64 | 
BPF_MUL | BPF_K] = &&ALU64_MUL_K, + [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, + [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, + [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, + [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, + [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, + [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, + [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, + [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, + [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, + /* Call instruction */ + [BPF_JMP | BPF_CALL] = &&JMP_CALL, + /* Jumps */ + [BPF_JMP | BPF_JA] = &&JMP_JA, + [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, + [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, + [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, + [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, + [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, + [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, + [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, + [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, + [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, + [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, + [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, + [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, + [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, + [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, + /* Program return */ + [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, + /* Store instructions */ + [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, + [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, + [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, + [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, + [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, + [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, + [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, + [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, + [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, + [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, + /* Load instructions */ + [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, + [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, + [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, + [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, + [BPF_LD | BPF_ABS | BPF_W] = 
&&LD_ABS_W, + [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, + [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, + [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, + [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, + [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, + }; + void *ptr; + int off; + +#define CONT ({ insn++; goto select_insn; }) +#define CONT_JMP ({ insn++; goto select_insn; }) + + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; + ARG1 = (u64) (unsigned long) ctx; + + /* Registers used in classic BPF programs need to be reset first. */ + regs[BPF_REG_A] = 0; + regs[BPF_REG_X] = 0; + +select_insn: + goto *jumptable[insn->code]; + + /* ALU */ +#define ALU(OPCODE, OP) \ + ALU64_##OPCODE##_X: \ + DST = DST OP SRC; \ + CONT; \ + ALU_##OPCODE##_X: \ + DST = (u32) DST OP (u32) SRC; \ + CONT; \ + ALU64_##OPCODE##_K: \ + DST = DST OP IMM; \ + CONT; \ + ALU_##OPCODE##_K: \ + DST = (u32) DST OP (u32) IMM; \ + CONT; + + ALU(ADD, +) + ALU(SUB, -) + ALU(AND, &) + ALU(OR, |) + ALU(LSH, <<) + ALU(RSH, >>) + ALU(XOR, ^) + ALU(MUL, *) +#undef ALU + ALU_NEG: + DST = (u32) -DST; + CONT; + ALU64_NEG: + DST = -DST; + CONT; + ALU_MOV_X: + DST = (u32) SRC; + CONT; + ALU_MOV_K: + DST = (u32) IMM; + CONT; + ALU64_MOV_X: + DST = SRC; + CONT; + ALU64_MOV_K: + DST = IMM; + CONT; + ALU64_ARSH_X: + (*(s64 *) &DST) >>= SRC; + CONT; + ALU64_ARSH_K: + (*(s64 *) &DST) >>= IMM; + CONT; + ALU64_MOD_X: + if (unlikely(SRC == 0)) + return 0; + tmp = DST; + DST = do_div(tmp, SRC); + CONT; + ALU_MOD_X: + if (unlikely(SRC == 0)) + return 0; + tmp = (u32) DST; + DST = do_div(tmp, (u32) SRC); + CONT; + ALU64_MOD_K: + tmp = DST; + DST = do_div(tmp, IMM); + CONT; + ALU_MOD_K: + tmp = (u32) DST; + DST = do_div(tmp, (u32) IMM); + CONT; + ALU64_DIV_X: + if (unlikely(SRC == 0)) + return 0; + do_div(DST, SRC); + CONT; + ALU_DIV_X: + if (unlikely(SRC == 0)) + return 0; + tmp = (u32) DST; + do_div(tmp, (u32) SRC); + DST = (u32) tmp; + CONT; + ALU64_DIV_K: + do_div(DST, IMM); + CONT; + ALU_DIV_K: + tmp = (u32) DST; + do_div(tmp, (u32) 
IMM); + DST = (u32) tmp; + CONT; + ALU_END_TO_BE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_be16(DST); + break; + case 32: + DST = (__force u32) cpu_to_be32(DST); + break; + case 64: + DST = (__force u64) cpu_to_be64(DST); + break; + } + CONT; + ALU_END_TO_LE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_le16(DST); + break; + case 32: + DST = (__force u32) cpu_to_le32(DST); + break; + case 64: + DST = (__force u64) cpu_to_le64(DST); + break; + } + CONT; + + /* CALL */ + JMP_CALL: + /* Function call scratches BPF_R1-BPF_R5 registers, + * preserves BPF_R6-BPF_R9, and stores return value + * into BPF_R0. + */ + BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, + BPF_R4, BPF_R5); + CONT; + + /* JMP */ + JMP_JA: + insn += insn->off; + CONT; + JMP_JEQ_X: + if (DST == SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JEQ_K: + if (DST == IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_X: + if (DST != SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_K: + if (DST != IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_X: + if (DST > SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_K: + if (DST > IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGE_X: + if (DST >= SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGE_K: + if (DST >= IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGT_X: + if (((s64) DST) > ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGT_K: + if (((s64) DST) > ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_X: + if (((s64) DST) >= ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_K: + if (((s64) DST) >= ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_X: + if (DST & SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_K: + if (DST & IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_EXIT: + return BPF_R0; + + /* STX and ST and LDX*/ 
+#define LDST(SIZEOP, SIZE) \ + STX_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ + CONT; \ + ST_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ + CONT; \ + LDX_MEM_##SIZEOP: \ + DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ + CONT; + + LDST(B, u8) + LDST(H, u16) + LDST(W, u32) + LDST(DW, u64) +#undef LDST + STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ + atomic_add((u32) SRC, (atomic_t *)(unsigned long) + (DST + insn->off)); + CONT; + STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ + atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) + (DST + insn->off)); + CONT; + LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ + off = IMM; +load_word: + /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are + * only appearing in the programs where ctx == + * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] + * == BPF_R6, sk_convert_filter() saves it in BPF_R6, + * internal BPF verifier will check that BPF_R6 == + * ctx. + * + * BPF_ABS and BPF_IND are wrappers of function calls, + * so they scratch BPF_R1-BPF_R5 registers, preserve + * BPF_R6-BPF_R9, and store return value into BPF_R0. 
+ * + * Implicit input: + * ctx == skb == BPF_R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness + */ + + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be32(ptr); + CONT; + } + + return 0; + LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ + off = IMM; +load_half: + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be16(ptr); + CONT; + } + + return 0; + LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ + off = IMM; +load_byte: + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = *(u8 *)ptr; + CONT; + } + + return 0; + LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_word; + LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_half; + LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ + off = IMM + SRC; + goto load_byte; + + default_label: + /* If we ever reach this, we have a bug somewhere. 
*/ + WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + return 0; +} + +void __weak bpf_int_jit_compile(struct sk_filter *prog) +{ +} + +/** + * sk_filter_select_runtime - select execution runtime for BPF program + * @fp: sk_filter populated with internal BPF program + * + * try to JIT internal BPF program, if JIT is not available select interpreter + * BPF program will be executed via SK_RUN_FILTER() macro + */ +void sk_filter_select_runtime(struct sk_filter *fp) +{ + fp->bpf_func = (void *) __sk_run_filter; + + /* Probe if internal BPF can be JITed */ + bpf_int_jit_compile(fp); +} +EXPORT_SYMBOL_GPL(sk_filter_select_runtime); + +/* free internal BPF program */ +void sk_filter_free(struct sk_filter *fp) +{ + bpf_jit_free(fp); +} +EXPORT_SYMBOL_GPL(sk_filter_free); -- cgit v1.1 From 7d8b6c63751cfbbe5eef81a48c22978b3407a3ad Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 23 Jul 2014 15:36:26 -0400 Subject: CAPABILITIES: remove undefined caps from all processes This is effectively a revert of 7b9a7ec565505699f503b4fcf61500dceb36e744 plus fixing it a different way... We found, when trying to run an application from an application which had dropped privs that the kernel does security checks on undefined capability bits. This was ESPECIALLY difficult to debug as those undefined bits are hidden from /proc/$PID/status. Consider a root application which drops all capabilities from ALL 4 capability sets. We assume, since the application is going to set eff/perm/inh from an array that it will clear not only the defined caps less than CAP_LAST_CAP, but also the higher 28ish bits which are undefined future capabilities. The BSET gets cleared differently. Instead it is cleared one bit at a time. The problem here is that in security/commoncap.c::cap_task_prctl() we actually check the validity of a capability being read. 
So any task which attempts to 'read all things set in bset' followed by 'unset all things set in bset' will not even attempt to unset the undefined bits higher than CAP_LAST_CAP. So the 'parent' will look something like: CapInh: 0000000000000000 CapPrm: 0000000000000000 CapEff: 0000000000000000 CapBnd: ffffffc000000000 All of this 'should' be fine. Given that these are undefined bits that aren't supposed to have anything to do with permissions. But they do... So let's now consider a task which cleared the eff/perm/inh completely and cleared all of the valid caps in the bset (but not the invalid caps it couldn't read out of the kernel). We know that this is exactly what the libcap-ng library does and what the go capabilities library does. They both leave you in that above situation if you try to clear all of your capabilities from all 4 sets. If that root task calls execve() the child task will pick up all caps not blocked by the bset. The bset however does not block bits higher than CAP_LAST_CAP. So now the child task has bits in eff which are not in the parent. These are 'meaningless' undefined bits, but still bits which the parent doesn't have. The problem is now in cred_cap_issubset() (or any operation which does a subset test) as the child, while a subset for valid cap bits, is not a subset for invalid cap bits! So now we set during commit creds that the child is not dumpable. Given it is 'more priv' than its parent. It also means the parent cannot ptrace the child and other stupidity. The solution here: 1) stop hiding capability bits in status This makes debugging easier! 2) stop giving any task undefined capability bits. it's simple, if you don't put those invalid bits in CAP_FULL_SET you won't get them in init and you won't get them in any other task either. 
This fixes the cap_issubset() tests and resulting fallout (which made the init task in a docker container untraceable among other things) 3) mask out undefined bits when sys_capset() is called as it might use ~0, ~0 to denote 'all capabilities' for backward/forward compatibility. This lets 'capsh --caps="all=eip" -- -c /bin/bash' run. 4) mask out undefined bit when we read a file capability off of disk as again likely all bits are set in the xattr for forward/backward compatibility. This lets 'setcap all+pe /bin/bash; /bin/bash' run Signed-off-by: Eric Paris Reviewed-by: Kees Cook Cc: Andrew Vagin Cc: Andrew G. Morgan Cc: Serge E. Hallyn Cc: Kees Cook Cc: Steve Grubb Cc: Dan Walsh Cc: stable@vger.kernel.org Signed-off-by: James Morris --- kernel/audit.c | 2 +- kernel/capability.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 3ef2e0e..ba2ff5a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1677,7 +1677,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) audit_log_format(ab, " %s=", prefix); CAP_FOR_EACH_U32(i) { audit_log_format(ab, "%08x", - cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); + cap->cap[CAP_LAST_U32 - i]); } } diff --git a/kernel/capability.c b/kernel/capability.c index a5cf13c..989f5bf 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -258,6 +258,10 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) i++; } + effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + new = prepare_creds(); if (!new) return -ENOMEM; -- cgit v1.1 From 4fae4e7624653ef498d0e2a38f00620b9701ab04 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jul 2014 15:39:21 +0200 Subject: irq: Warn when shared interrupts do not match on NO_SUSPEND When suspend_device_irqs() iterates all descriptors, its 
pointless if one has NO_SUSPEND set while another has not. Validate on request_irq() that NO_SUSPEND state matches for SHARED interrupts. Signed-off-by: Peter Zijlstra Acked-by: "Rafael J. Wysocki" Link: http://lkml.kernel.org/r/20140724133921.GY6758@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 88657d7..27a1fe0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1077,9 +1077,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * set the trigger type must match. Also all must * agree on ONESHOT. */ + +#define IRQF_MISMATCH \ + (IRQF_TRIGGER_MASK | IRQF_ONESHOT | IRQF_NO_SUSPEND) + if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || - ((old->flags ^ new->flags) & IRQF_ONESHOT)) + ((old->flags ^ new->flags) & IRQF_MISMATCH)) goto mismatch; /* All handlers must agree on per-cpuness */ -- cgit v1.1 From 2695fb552cbef1029aa025a98acb80cc51d66de5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 24 Jul 2014 16:38:21 -0700 Subject: net: filter: rename 'struct sock_filter_int' into 'struct bpf_insn' eBPF is used by socket filtering, seccomp and soon by tracing and exposed to userspace, therefore 'sock_filter_int' name is not accurate. Rename it to 'bpf_insn' Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 2 +- kernel/seccomp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 77a240a..265a02c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -81,7 +81,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) * keep, 0 for none. @ctx is the data we are operating on, @insn is the * array of filter instructions. 
*/ -static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) +static unsigned int __sk_run_filter(void *ctx, const struct bpf_insn *insn) { u64 stack[MAX_BPF_STACK / sizeof(u64)]; u64 regs[MAX_BPF_REG], tmp; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 301bbc2..565743d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -248,7 +248,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) if (ret) goto free_prog; - /* Convert 'sock_filter' insns to 'sock_filter_int' insns */ + /* Convert 'sock_filter' insns to 'bpf_insn' insns */ ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); if (ret) goto free_prog; -- cgit v1.1 From 9b20a352d78a7651aa68a9220f77ccb03009d892 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Sun, 27 Jul 2014 07:24:01 +0930 Subject: module: add within_module() function It is just a small optimization that allows to replace few occurrences of within_module_init() || within_module_core() with a single call. Signed-off-by: Petr Mladek Signed-off-by: Rusty Russell --- kernel/module.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 81e727c..e87fdd2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3448,8 +3448,7 @@ const char *module_address_lookup(unsigned long addr, list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { if (modname) *modname = mod->name; ret = get_ksymbol(mod, addr, size, offset); @@ -3473,8 +3472,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, NULL, NULL); @@ -3499,8 
+3497,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, size, offset); @@ -3764,8 +3761,7 @@ struct module *__module_address(unsigned long addr) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_core(addr, mod) - || within_module_init(addr, mod)) + if (within_module(addr, mod)) return mod; } return NULL; -- cgit v1.1 From 2e3a10a1551d6ceea005e6a62ca58183b8976217 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 27 Jul 2014 07:29:01 +0930 Subject: ARM: avoid ARM binutils leaking ELF local symbols Symbols starting with .L are ELF local symbols and should not appear in ELF symbol tables. However, unfortunately ARM binutils leaks the .LANCHOR symbols into the symbol table, which leads kallsyms to report these symbols rather than the real name. It is not very useful when %pf reports symbols against these leaked .LANCHOR symbols. Arrange for kallsyms to ignore these symbols using the same mechanism that is used for the ARM mapping symbols. Signed-off-by: Russell King Signed-off-by: Rusty Russell --- kernel/module.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e87fdd2..cd9bce9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3385,6 +3385,8 @@ static inline int within(unsigned long addr, void *start, unsigned long size) */ static inline int is_arm_mapping_symbol(const char *str) { + if (str[0] == '.' 
&& str[1] == 'L') + return true; return str[0] == '$' && strchr("atd", str[1]) && (str[2] == '\0' || str[2] == '.'); } -- cgit v1.1 From f469f02dc6fa67f6c6a7d91400d08b9339147aed Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 21 Jul 2014 12:26:57 +0200 Subject: PM / Hibernate: Create a Radix-Tree to store memory bitmap This patch adds the code to allocate and build the radix tree to store the memory bitmap. The old data structure is left in place until the radix tree implementation is finished. Signed-off-by: Joerg Roedel Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 223 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 222 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1ea328a..5a0eafd 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -248,11 +248,24 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) * information is stored (in the form of a block of bitmap) * It also contains the pfns that correspond to the start and end of * the represented memory area. + * + * The memory bitmap is organized as a radix tree to guarantee fast random + * access to the bits. There is one radix tree for each zone (as returned + * from create_mem_extents). + * + * One radix tree is represented by one struct mem_zone_bm_rtree. There are + * two linked lists for the nodes of the tree, one for the inner nodes and + * one for the leave nodes. The linked leave nodes are used for fast linear + * access of the memory bitmap. + * + * The struct rtree_node represents one node of the radix tree. 
*/ #define BM_END_OF_MAP (~0UL) #define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) +#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3) +#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1) struct bm_block { struct list_head hook; /* hook into a list of bitmap blocks */ @@ -266,6 +279,31 @@ static inline unsigned long bm_block_bits(struct bm_block *bb) return bb->end_pfn - bb->start_pfn; } +/* + * struct rtree_node is a wrapper struct to link the nodes + * of the rtree together for easy linear iteration over + * bits and easy freeing + */ +struct rtree_node { + struct list_head list; + unsigned long *data; +}; + +/* + * struct mem_zone_bm_rtree represents a bitmap used for one + * populated memory zone. + */ +struct mem_zone_bm_rtree { + struct list_head list; /* Link Zones together */ + struct list_head nodes; /* Radix Tree inner nodes */ + struct list_head leaves; /* Radix Tree leaves */ + unsigned long start_pfn; /* Zone start page frame */ + unsigned long end_pfn; /* Zone end page frame + 1 */ + struct rtree_node *rtree; /* Radix Tree Root */ + int levels; /* Number of Radix Tree Levels */ + unsigned int blocks; /* Number of Bitmap Blocks */ +}; + /* strcut bm_position is used for browsing memory bitmaps */ struct bm_position { @@ -274,6 +312,7 @@ struct bm_position { }; struct memory_bitmap { + struct list_head zones; struct list_head blocks; /* list of bitmap blocks */ struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block @@ -284,6 +323,166 @@ struct memory_bitmap { /* Functions that operate on memory bitmaps */ +#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long)) +#if BITS_PER_LONG == 32 +#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2) +#else +#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3) +#endif +#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) + +/* + * alloc_rtree_node - Allocate a new node and add it to the radix tree. 
+ * + * This function is used to allocate inner nodes as well as the + * leave nodes of the radix tree. It also adds the node to the + * corresponding linked list passed in by the *list parameter. + */ +static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, + struct chain_allocator *ca, + struct list_head *list) +{ + struct rtree_node *node; + + node = chain_alloc(ca, sizeof(struct rtree_node)); + if (!node) + return NULL; + + node->data = get_image_page(gfp_mask, safe_needed); + if (!node->data) + return NULL; + + list_add_tail(&node->list, list); + + return node; +} + +/* + * add_rtree_block - Add a new leave node to the radix tree + * + * The leave nodes need to be allocated in order to keep the leaves + * linked list in order. This is guaranteed by the zone->blocks + * counter. + */ +static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, + int safe_needed, struct chain_allocator *ca) +{ + struct rtree_node *node, *block, **dst; + unsigned int levels_needed, block_nr; + int i; + + block_nr = zone->blocks; + levels_needed = 0; + + /* How many levels do we need for this block nr? 
*/ + while (block_nr) { + levels_needed += 1; + block_nr >>= BM_RTREE_LEVEL_SHIFT; + } + + /* Make sure the rtree has enough levels */ + for (i = zone->levels; i < levels_needed; i++) { + node = alloc_rtree_node(gfp_mask, safe_needed, ca, + &zone->nodes); + if (!node) + return -ENOMEM; + + node->data[0] = (unsigned long)zone->rtree; + zone->rtree = node; + zone->levels += 1; + } + + /* Allocate new block */ + block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves); + if (!block) + return -ENOMEM; + + /* Now walk the rtree to insert the block */ + node = zone->rtree; + dst = &zone->rtree; + block_nr = zone->blocks; + for (i = zone->levels; i > 0; i--) { + int index; + + if (!node) { + node = alloc_rtree_node(gfp_mask, safe_needed, ca, + &zone->nodes); + if (!node) + return -ENOMEM; + *dst = node; + } + + index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); + index &= BM_RTREE_LEVEL_MASK; + dst = (struct rtree_node **)&((*dst)->data[index]); + node = *dst; + } + + zone->blocks += 1; + *dst = block; + + return 0; +} + +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, + int clear_nosave_free); + +/* + * create_zone_bm_rtree - create a radix tree for one zone + * + * Allocated the mem_zone_bm_rtree structure and initializes it. + * This function also allocated and builds the radix tree for the + * zone. 
+ */ +static struct mem_zone_bm_rtree * +create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, + struct chain_allocator *ca, + unsigned long start, unsigned long end) +{ + struct mem_zone_bm_rtree *zone; + unsigned int i, nr_blocks; + unsigned long pages; + + pages = end - start; + zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree)); + if (!zone) + return NULL; + + INIT_LIST_HEAD(&zone->nodes); + INIT_LIST_HEAD(&zone->leaves); + zone->start_pfn = start; + zone->end_pfn = end; + nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); + + for (i = 0; i < nr_blocks; i++) { + if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) { + free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR); + return NULL; + } + } + + return zone; +} + +/* + * free_zone_bm_rtree - Free the memory of the radix tree + * + * Free all node pages of the radix tree. The mem_zone_bm_rtree + * structure itself is not freed here nor are the rtree_node + * structs. + */ +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, + int clear_nosave_free) +{ + struct rtree_node *node; + + list_for_each_entry(node, &zone->nodes, list) + free_image_page(node->data, clear_nosave_free); + + list_for_each_entry(node, &zone->leaves, list) + free_image_page(node->data, clear_nosave_free); +} + static void memory_bm_position_reset(struct memory_bitmap *bm) { bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); @@ -408,12 +607,14 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) chain_init(&ca, gfp_mask, safe_needed); INIT_LIST_HEAD(&bm->blocks); + INIT_LIST_HEAD(&bm->zones); error = create_mem_extents(&mem_extents, gfp_mask); if (error) return error; list_for_each_entry(ext, &mem_extents, hook) { + struct mem_zone_bm_rtree *zone; struct bm_block *bb; unsigned long pfn = ext->start; unsigned long pages = ext->end - ext->start; @@ -441,6 +642,12 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) } bb->end_pfn = pfn; } + + zone = 
create_zone_bm_rtree(gfp_mask, safe_needed, &ca, + ext->start, ext->end); + if (!zone) + goto Error; + list_add_tail(&zone->list, &bm->zones); } bm->p_list = ca.chain; @@ -460,14 +667,19 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) */ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { + struct mem_zone_bm_rtree *zone; struct bm_block *bb; list_for_each_entry(bb, &bm->blocks, hook) if (bb->data) free_image_page(bb->data, clear_nosave_free); + list_for_each_entry(zone, &bm->zones, list) + free_zone_bm_rtree(zone, clear_nosave_free); + free_list_of_pages(bm->p_list, clear_nosave_free); + INIT_LIST_HEAD(&bm->zones); INIT_LIST_HEAD(&bm->blocks); } @@ -816,12 +1028,21 @@ void free_basic_memory_bitmaps(void) unsigned int snapshot_additional_pages(struct zone *zone) { + unsigned int rtree, nodes; unsigned int res; res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); res += DIV_ROUND_UP(res * sizeof(struct bm_block), LINKED_PAGE_DATA_SIZE); - return 2 * res; + rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); + rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node), + LINKED_PAGE_DATA_SIZE); + while (nodes > 1) { + nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL); + rtree += nodes; + } + + return 2 * (res + rtree); } #ifdef CONFIG_HIGHMEM -- cgit v1.1 From 07a338236fdcd6caf41541dcdf879f5758020ab1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 21 Jul 2014 12:26:58 +0200 Subject: PM / Hibernate: Add memory_rtree_find_bit function Add a function to find a bit in the radix tree for a given pfn. Also add code to the memory bitmap wrapper functions to use the radix tree together with the existing memory bitmap implementation. On read accesses compare the results of both bitmaps to make sure the radix tree behaves the same way. Signed-off-by: Joerg Roedel Signed-off-by: Rafael J. 
Wysocki --- kernel/power/snapshot.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5a0eafd..0b7f934 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -720,6 +720,56 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, return 0; } +/* + * memory_rtree_find_bit - Find the bit for pfn in the memory + * bitmap + * + * Walks the radix tree to find the page which contains the bit for + * pfn and returns the bit position in **addr and *bit_nr. + */ +static int memory_rtree_find_bit(struct memory_bitmap *bm, unsigned long pfn, + void **addr, unsigned int *bit_nr) +{ + struct mem_zone_bm_rtree *curr, *zone; + struct rtree_node *node; + int i, block_nr; + + zone = NULL; + + /* Find the right zone */ + list_for_each_entry(curr, &bm->zones, list) { + if (pfn >= curr->start_pfn && pfn < curr->end_pfn) { + zone = curr; + break; + } + } + + if (!zone) + return -EFAULT; + + /* + * We have a zone. Now walk the radix tree to find the leave + * node for our pfn. 
+ */ + node = zone->rtree; + block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT; + + for (i = zone->levels; i > 0; i--) { + int index; + + index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); + index &= BM_RTREE_LEVEL_MASK; + BUG_ON(node->data[index] == 0); + node = (struct rtree_node *)node->data[index]; + } + + /* Set return values */ + *addr = node->data; + *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK; + + return 0; +} + static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; @@ -729,6 +779,10 @@ static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); set_bit(bit, addr); + + error = memory_rtree_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); + set_bit(bit, addr); } static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) @@ -740,6 +794,13 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) error = memory_bm_find_bit(bm, pfn, &addr, &bit); if (!error) set_bit(bit, addr); + else + return error; + + error = memory_rtree_find_bit(bm, pfn, &addr, &bit); + if (!error) + set_bit(bit, addr); + return error; } @@ -752,25 +813,42 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); clear_bit(bit, addr); + + error = memory_rtree_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); + clear_bit(bit, addr); } static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; - int error; + int error, error2; + int v; error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); - return test_bit(bit, addr); + v = test_bit(bit, addr); + + error2 = memory_rtree_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error2); + + WARN_ON_ONCE(v != test_bit(bit, addr)); + + return v; } static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; 
+ int present; + + present = !memory_bm_find_bit(bm, pfn, &addr, &bit); + + WARN_ON_ONCE(present != !memory_rtree_find_bit(bm, pfn, &addr, &bit)); - return !memory_bm_find_bit(bm, pfn, &addr, &bit); + return present; } /** -- cgit v1.1 From 3a20cb1779616ebcaade393cc9beac0e03cbffef Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 21 Jul 2014 12:26:59 +0200 Subject: PM / Hibernate: Implement position keeping in radix tree Add code to remember the last position that was requested in the radix tree. Use it as a cache for faster linear walking of the bitmap in the memory_bm_rtree_next_pfn() function which is also added with this patch. Signed-off-by: Joerg Roedel Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0b7f934..802f241 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -309,6 +309,11 @@ struct mem_zone_bm_rtree { struct bm_position { struct bm_block *block; int bit; + + struct mem_zone_bm_rtree *zone; + struct rtree_node *node; + unsigned long node_pfn; + int node_bit; }; struct memory_bitmap { @@ -487,6 +492,13 @@ static void memory_bm_position_reset(struct memory_bitmap *bm) { bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); bm->cur.bit = 0; + + bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, + list); + bm->cur.node = list_entry(bm->cur.zone->leaves.next, + struct rtree_node, list); + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; } static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); @@ -734,6 +746,11 @@ static int memory_rtree_find_bit(struct memory_bitmap *bm, unsigned long pfn, struct rtree_node *node; int i, block_nr; + zone = bm->cur.zone; + + if (pfn >= zone->start_pfn && pfn < zone->end_pfn) + goto zone_found; + zone = NULL; /* Find the right zone */ @@ -747,10 +764,16 @@ static 
int memory_rtree_find_bit(struct memory_bitmap *bm, unsigned long pfn, if (!zone) return -EFAULT; +zone_found: /* * We have a zone. Now walk the radix tree to find the leave * node for our pfn. */ + + node = bm->cur.node; + if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) + goto node_found; + node = zone->rtree; block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT; @@ -763,6 +786,12 @@ static int memory_rtree_find_bit(struct memory_bitmap *bm, unsigned long pfn, node = (struct rtree_node *)node->data[index]; } +node_found: + /* Update last position */ + bm->cur.zone = zone; + bm->cur.node = node; + bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + /* Set return values */ *addr = node->data; *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK; @@ -860,11 +889,16 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) * this function. */ +static unsigned long memory_bm_rtree_next_pfn(struct memory_bitmap *bm); + static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { + unsigned long rtree_pfn; struct bm_block *bb; int bit; + rtree_pfn = memory_bm_rtree_next_pfn(bm); + bb = bm->cur.block; do { bit = bm->cur.bit; @@ -878,13 +912,77 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) } while (&bb->hook != &bm->blocks); memory_bm_position_reset(bm); + WARN_ON_ONCE(rtree_pfn != BM_END_OF_MAP); return BM_END_OF_MAP; Return_pfn: + WARN_ON_ONCE(bb->start_pfn + bit != rtree_pfn); bm->cur.bit = bit + 1; return bb->start_pfn + bit; } +/* + * rtree_next_node - Jumps to the next leave node + * + * Sets the position to the beginning of the next node in the + * memory bitmap. This is either the next node in the current + * zone's radix tree or the first node in the radix tree of the + * next zone. + * + * Returns true if there is a next node, false otherwise. 
+ */ +static bool rtree_next_node(struct memory_bitmap *bm) +{ + bm->cur.node = list_entry(bm->cur.node->list.next, + struct rtree_node, list); + if (&bm->cur.node->list != &bm->cur.zone->leaves) { + bm->cur.node_pfn += BM_BITS_PER_BLOCK; + bm->cur.node_bit = 0; + return true; + } + + /* No more nodes, goto next zone */ + bm->cur.zone = list_entry(bm->cur.zone->list.next, + struct mem_zone_bm_rtree, list); + if (&bm->cur.zone->list != &bm->zones) { + bm->cur.node = list_entry(bm->cur.zone->leaves.next, + struct rtree_node, list); + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; + return true; + } + + /* No more zones */ + return false; +} + +/* + * memory_bm_rtree_next_pfn - Find the next set bit + * + * Starting from the last returned position this function searches + * for the next set bit in the memory bitmap and returns its + * number. If no more bit is set BM_END_OF_MAP is returned. + */ +static unsigned long memory_bm_rtree_next_pfn(struct memory_bitmap *bm) +{ + unsigned long bits, pfn, pages; + int bit; + + do { + pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn; + bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK); + bit = find_next_bit(bm->cur.node->data, bits, + bm->cur.node_bit); + if (bit < bits) { + pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; + bm->cur.node_bit = bit + 1; + return pfn; + } + } while (rtree_next_node(bm)); + + return BM_END_OF_MAP; +} + /** * This structure represents a range of page frames the contents of which * should not be saved during the suspend. -- cgit v1.1 From 6efde38f07690652bf0d93f5e4f1a5f496574806 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 21 Jul 2014 12:27:00 +0200 Subject: PM / Hibernate: Iterate over set bits instead of PFNs in swsusp_free() The existing implementation of swsusp_free iterates over all pfns in the system and checks every bit in the two memory bitmaps. This doesn't scale very well with large numbers of pfns, especially when the bitmaps are not populated very densly. 
Change the algorithm to iterate over the set bits in the bitmaps instead to make it scale better in large memory configurations. Also add a memory_bm_clear_current() helper function that clears the bit for the last position returned from the memory bitmap. Signed-off-by: Joerg Roedel Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 53 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 802f241..5b71caf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -848,6 +848,17 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) clear_bit(bit, addr); } +static void memory_bm_clear_current(struct memory_bitmap *bm) +{ + int bit; + + bit = max(bm->cur.node_bit - 1, 0); + clear_bit(bit, bm->cur.node->data); + + bit = max(bm->cur.bit - 1, 0); + clear_bit(bit, bm->cur.block->data); +} + static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; @@ -1491,23 +1502,35 @@ static struct memory_bitmap copy_bm; void swsusp_free(void) { - struct zone *zone; - unsigned long pfn, max_zone_pfn; + unsigned long fb_pfn, fr_pfn; - for_each_populated_zone(zone) { - max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - - if (swsusp_page_is_forbidden(page) && - swsusp_page_is_free(page)) { - swsusp_unset_page_forbidden(page); - swsusp_unset_page_free(page); - __free_page(page); - } - } + memory_bm_position_reset(forbidden_pages_map); + memory_bm_position_reset(free_pages_map); + +loop: + fr_pfn = memory_bm_next_pfn(free_pages_map); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + + /* + * Find the next bit set in both bitmaps. This is guaranteed to + * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. 
+ */ + do { + if (fb_pfn < fr_pfn) + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + if (fr_pfn < fb_pfn) + fr_pfn = memory_bm_next_pfn(free_pages_map); + } while (fb_pfn != fr_pfn); + + if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { + struct page *page = pfn_to_page(fr_pfn); + + memory_bm_clear_current(forbidden_pages_map); + memory_bm_clear_current(free_pages_map); + __free_page(page); + goto loop; } + nr_copy_pages = 0; nr_meta_pages = 0; restore_pblist = NULL; -- cgit v1.1 From 9047eb629e5cd25ae3834d8c62ae02eb8c32bc17 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 21 Jul 2014 12:27:01 +0200 Subject: PM / Hibernate: Remove the old memory-bitmap implementation The radix tree implementatio is proved to work the same as the old implementation now. So the old implementation can be removed to finish the switch to the radix tree for the memory bitmaps. Signed-off-by: Joerg Roedel Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 223 +++++------------------------------------------- 1 file changed, 21 insertions(+), 202 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5b71caf..ab1998a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -267,18 +267,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) #define BM_BLOCK_SHIFT (PAGE_SHIFT + 3) #define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1) -struct bm_block { - struct list_head hook; /* hook into a list of bitmap blocks */ - unsigned long start_pfn; /* pfn represented by the first bit */ - unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ - unsigned long *data; /* bitmap representing pages */ -}; - -static inline unsigned long bm_block_bits(struct bm_block *bb) -{ - return bb->end_pfn - bb->start_pfn; -} - /* * struct rtree_node is a wrapper struct to link the nodes * of the rtree together for easy linear iteration over @@ -307,9 +295,6 @@ struct mem_zone_bm_rtree { /* strcut 
bm_position is used for browsing memory bitmaps */ struct bm_position { - struct bm_block *block; - int bit; - struct mem_zone_bm_rtree *zone; struct rtree_node *node; unsigned long node_pfn; @@ -318,7 +303,6 @@ struct bm_position { struct memory_bitmap { struct list_head zones; - struct list_head blocks; /* list of bitmap blocks */ struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects @@ -490,9 +474,6 @@ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, static void memory_bm_position_reset(struct memory_bitmap *bm) { - bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); - bm->cur.bit = 0; - bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, list); bm->cur.node = list_entry(bm->cur.zone->leaves.next, @@ -503,30 +484,6 @@ static void memory_bm_position_reset(struct memory_bitmap *bm) static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); -/** - * create_bm_block_list - create a list of block bitmap objects - * @pages - number of pages to track - * @list - list to put the allocated blocks into - * @ca - chain allocator to be used for allocating memory - */ -static int create_bm_block_list(unsigned long pages, - struct list_head *list, - struct chain_allocator *ca) -{ - unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); - - while (nr_blocks-- > 0) { - struct bm_block *bb; - - bb = chain_alloc(ca, sizeof(struct bm_block)); - if (!bb) - return -ENOMEM; - list_add(&bb->hook, list); - } - - return 0; -} - struct mem_extent { struct list_head hook; unsigned long start; @@ -618,7 +575,6 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) int error; chain_init(&ca, gfp_mask, safe_needed); - INIT_LIST_HEAD(&bm->blocks); INIT_LIST_HEAD(&bm->zones); error = create_mem_extents(&mem_extents, gfp_mask); @@ -627,38 +583,13 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) 
list_for_each_entry(ext, &mem_extents, hook) { struct mem_zone_bm_rtree *zone; - struct bm_block *bb; - unsigned long pfn = ext->start; - unsigned long pages = ext->end - ext->start; - - bb = list_entry(bm->blocks.prev, struct bm_block, hook); - - error = create_bm_block_list(pages, bm->blocks.prev, &ca); - if (error) - goto Error; - - list_for_each_entry_continue(bb, &bm->blocks, hook) { - bb->data = get_image_page(gfp_mask, safe_needed); - if (!bb->data) { - error = -ENOMEM; - goto Error; - } - - bb->start_pfn = pfn; - if (pages >= BM_BITS_PER_BLOCK) { - pfn += BM_BITS_PER_BLOCK; - pages -= BM_BITS_PER_BLOCK; - } else { - /* This is executed only once in the loop */ - pfn += pages; - } - bb->end_pfn = pfn; - } zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca, ext->start, ext->end); - if (!zone) + if (!zone) { + error = -ENOMEM; goto Error; + } list_add_tail(&zone->list, &bm->zones); } @@ -680,11 +611,6 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { struct mem_zone_bm_rtree *zone; - struct bm_block *bb; - - list_for_each_entry(bb, &bm->blocks, hook) - if (bb->data) - free_image_page(bb->data, clear_nosave_free); list_for_each_entry(zone, &bm->zones, list) free_zone_bm_rtree(zone, clear_nosave_free); @@ -692,55 +618,20 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) free_list_of_pages(bm->p_list, clear_nosave_free); INIT_LIST_HEAD(&bm->zones); - INIT_LIST_HEAD(&bm->blocks); } /** - * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds - * to given pfn. The cur_zone_bm member of @bm and the cur_block member - * of @bm->cur_zone_bm are updated. 
- */ -static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, - void **addr, unsigned int *bit_nr) -{ - struct bm_block *bb; - - /* - * Check if the pfn corresponds to the current bitmap block and find - * the block where it fits if this is not the case. - */ - bb = bm->cur.block; - if (pfn < bb->start_pfn) - list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) - if (pfn >= bb->start_pfn) - break; - - if (pfn >= bb->end_pfn) - list_for_each_entry_continue(bb, &bm->blocks, hook) - if (pfn >= bb->start_pfn && pfn < bb->end_pfn) - break; - - if (&bb->hook == &bm->blocks) - return -EFAULT; - - /* The block has been found */ - bm->cur.block = bb; - pfn -= bb->start_pfn; - bm->cur.bit = pfn + 1; - *bit_nr = pfn; - *addr = bb->data; - return 0; -} - -/* - * memory_rtree_find_bit - Find the bit for pfn in the memory - * bitmap + * memory_bm_find_bit - Find the bit for pfn in the memory + * bitmap * - * Walks the radix tree to find the page which contains the bit for + * Find the bit in the bitmap @bm that corresponds to given pfn. + * The cur.zone, cur.block and cur.node_pfn member of @bm are + * updated. + * It walks the radix tree to find the page which contains the bit for * pfn and returns the bit position in **addr and *bit_nr. 
*/ -static int memory_rtree_find_bit(struct memory_bitmap *bm, unsigned long pfn, - void **addr, unsigned int *bit_nr) +static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + void **addr, unsigned int *bit_nr) { struct mem_zone_bm_rtree *curr, *zone; struct rtree_node *node; @@ -808,10 +699,6 @@ static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); set_bit(bit, addr); - - error = memory_rtree_find_bit(bm, pfn, &addr, &bit); - BUG_ON(error); - set_bit(bit, addr); } static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) @@ -823,12 +710,6 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) error = memory_bm_find_bit(bm, pfn, &addr, &bit); if (!error) set_bit(bit, addr); - else - return error; - - error = memory_rtree_find_bit(bm, pfn, &addr, &bit); - if (!error) - set_bit(bit, addr); return error; } @@ -842,10 +723,6 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); clear_bit(bit, addr); - - error = memory_rtree_find_bit(bm, pfn, &addr, &bit); - BUG_ON(error); - clear_bit(bit, addr); } static void memory_bm_clear_current(struct memory_bitmap *bm) @@ -854,82 +731,25 @@ static void memory_bm_clear_current(struct memory_bitmap *bm) bit = max(bm->cur.node_bit - 1, 0); clear_bit(bit, bm->cur.node->data); - - bit = max(bm->cur.bit - 1, 0); - clear_bit(bit, bm->cur.block->data); } static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; - int error, error2; - int v; + int error; error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); - v = test_bit(bit, addr); - - error2 = memory_rtree_find_bit(bm, pfn, &addr, &bit); - BUG_ON(error2); - - WARN_ON_ONCE(v != test_bit(bit, addr)); - - return v; + return test_bit(bit, addr); } static bool memory_bm_pfn_present(struct 
memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; - int present; - - present = !memory_bm_find_bit(bm, pfn, &addr, &bit); - - WARN_ON_ONCE(present != !memory_rtree_find_bit(bm, pfn, &addr, &bit)); - return present; -} - -/** - * memory_bm_next_pfn - find the pfn that corresponds to the next set bit - * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is - * returned. - * - * It is required to run memory_bm_position_reset() before the first call to - * this function. - */ - -static unsigned long memory_bm_rtree_next_pfn(struct memory_bitmap *bm); - -static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) -{ - unsigned long rtree_pfn; - struct bm_block *bb; - int bit; - - rtree_pfn = memory_bm_rtree_next_pfn(bm); - - bb = bm->cur.block; - do { - bit = bm->cur.bit; - bit = find_next_bit(bb->data, bm_block_bits(bb), bit); - if (bit < bm_block_bits(bb)) - goto Return_pfn; - - bb = list_entry(bb->hook.next, struct bm_block, hook); - bm->cur.block = bb; - bm->cur.bit = 0; - } while (&bb->hook != &bm->blocks); - - memory_bm_position_reset(bm); - WARN_ON_ONCE(rtree_pfn != BM_END_OF_MAP); - return BM_END_OF_MAP; - - Return_pfn: - WARN_ON_ONCE(bb->start_pfn + bit != rtree_pfn); - bm->cur.bit = bit + 1; - return bb->start_pfn + bit; + return !memory_bm_find_bit(bm, pfn, &addr, &bit); } /* @@ -967,14 +787,17 @@ static bool rtree_next_node(struct memory_bitmap *bm) return false; } -/* - * memory_bm_rtree_next_pfn - Find the next set bit +/** + * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm * * Starting from the last returned position this function searches * for the next set bit in the memory bitmap and returns its * number. If no more bit is set BM_END_OF_MAP is returned. + * + * It is required to run memory_bm_position_reset() before the + * first call to this function. 
*/ -static unsigned long memory_bm_rtree_next_pfn(struct memory_bitmap *bm) +static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { unsigned long bits, pfn, pages; int bit; @@ -1216,11 +1039,7 @@ void free_basic_memory_bitmaps(void) unsigned int snapshot_additional_pages(struct zone *zone) { unsigned int rtree, nodes; - unsigned int res; - res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), - LINKED_PAGE_DATA_SIZE); rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node), LINKED_PAGE_DATA_SIZE); @@ -1229,7 +1048,7 @@ unsigned int snapshot_additional_pages(struct zone *zone) rtree += nodes; } - return 2 * (res + rtree); + return 2 * rtree; } #ifdef CONFIG_HIGHMEM -- cgit v1.1 From 0f7d83e85dbd5bb8032ebed7713edf59670fb074 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 21 Jul 2014 12:27:02 +0200 Subject: PM / Hibernate: Touch Soft Lockup Watchdog in rtree_next_node When a memory bitmap is fully populated on a large memory machine (several TB of RAM) it can take more than a minute to walk through all bits. This causes the soft lockup detector on these machine to report warnings. Avoid this by touching the soft lockup watchdog in the memory bitmap walking code. Signed-off-by: Joerg Roedel Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ab1998a..4fc5c32 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -769,6 +769,7 @@ static bool rtree_next_node(struct memory_bitmap *bm) if (&bm->cur.node->list != &bm->cur.zone->leaves) { bm->cur.node_pfn += BM_BITS_PER_BLOCK; bm->cur.node_bit = 0; + touch_softlockup_watchdog(); return true; } -- cgit v1.1 From 728dba3a39c66b3d8ac889ddbe38b5b1c264aec3 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Mon, 3 Feb 2014 19:13:49 -0800 Subject: namespaces: Use task_lock and not rcu to protect nsproxy The synchronous synchronize_rcu in switch_task_namespaces makes setns a sufficiently expensive system call that people have complained. Upon inspection, nsproxy no longer needs rcu protection for remote reads. Remote reads are rare. So optimize for same process reads and writes by switching to task_lock instead. This yields a simpler to understand lock, and a faster setns system call. In particular this fixes a performance regression observed by Rafael David Tinoco. This is effectively a revert of Pavel Emelyanov's commit cf7b708c8d1d7a27736771bcf4c457b332b0f818 Make access to task's nsproxy lighter from 2007. The race this originally fixed no longer exists as do_notify_parent uses task_active_pid_ns(parent) instead of parent->nsproxy. Signed-off-by: "Eric W. Biederman" --- kernel/nsproxy.c | 15 ++++----------- kernel/utsname.c | 6 +++--- 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 8e78110..ef42d0a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) might_sleep(); + task_lock(p); ns = p->nsproxy; + p->nsproxy = new; + task_unlock(p); - rcu_assign_pointer(p->nsproxy, new); - - if (ns && atomic_dec_and_test(&ns->count)) { - /* - * wait for others to get what they want from this nsproxy.
- * - * cannot release this nsproxy via the call_rcu() since - * put_mnt_ns() will want to sleep - */ - synchronize_rcu(); + if (ns && atomic_dec_and_test(&ns->count)) free_nsproxy(ns); - } } void exit_task_namespaces(struct task_struct *p) diff --git a/kernel/utsname.c b/kernel/utsname.c index fd39312..883aaaa 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task) struct uts_namespace *ns = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->uts_ns; get_uts_ns(ns); } - rcu_read_unlock(); + task_unlock(task); return ns; } -- cgit v1.1 From 21d1f908d39559b013ea857c1685253476b95a4a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Jul 2014 20:57:10 +0200 Subject: Revert "PM / sleep / irq: Do not suspend wakeup interrupts" This reverts commit d709f7bcbb3ab01704fa7b37a2e4b981cf3783c1. Undo, because it might break existing functionality. Requested-by: Rafael J.
Wysocki Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 27a1fe0..53e9448 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -385,8 +385,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) { if (suspend) { - if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND) || - irqd_has_set(&desc->irq_data, IRQD_WAKEUP_STATE)) + if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) return; desc->istate |= IRQS_SUSPENDED; } -- cgit v1.1 From c6f1224573c3b609bd8073b39f496637a16cc06f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Jul 2014 20:58:28 +0200 Subject: Revert "irq: Warn when shared interrupts do not match on NO_SUSPEND" This reverts commit 4fae4e7624653ef498d0e2a38f00620b9701ab04. Undo because it breaks working systems. Requested-by: Rafael J. Wysocki Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 53e9448..3dc6a61 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1076,12 +1076,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * set the trigger type must match. Also all must * agree on ONESHOT. 
*/ - -#define IRQF_MISMATCH \ - (IRQF_TRIGGER_MASK | IRQF_ONESHOT | IRQF_NO_SUSPEND) - if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_MISMATCH)) + ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || + ((old->flags ^ new->flags) & IRQF_ONESHOT)) goto mismatch; /* All handlers must agree on per-cpuness */ -- cgit v1.1 From 4df95ff488eb796aab9566652c250330179def17 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:14 -0700 Subject: net: filter: rename sk_chk_filter() -> bpf_check_classic() trivial rename to indicate that this functions performs classic BPF checking Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 2 +- kernel/seccomp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 265a02c..b479807 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -18,7 +18,7 @@ * 2 of the License, or (at your option) any later version. * * Andi Kleen - Fix a few bad bugs and races. - * Kris Katterjohn - Added many additional checks in sk_chk_filter() + * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ #include #include diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 565743d..f4a77d2 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -87,7 +87,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) * @filter: filter to verify * @flen: length of filter * - * Takes a previously checked filter (by sk_chk_filter) and + * Takes a previously checked filter (by bpf_check_classic) and * redirects all filter code that loads struct sk_buff data * and related data through seccomp_bpf_load. It also * enforces length and alignment checking of those loads. 
@@ -239,7 +239,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) goto free_prog; /* Check and rewrite the fprog via the skb checker */ - ret = sk_chk_filter(fp, fprog->len); + ret = bpf_check_classic(fp, fprog->len); if (ret) goto free_prog; -- cgit v1.1 From 8fb575ca396bc31d9fa99c26336e2432b41d1bfc Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:15 -0700 Subject: net: filter: rename sk_convert_filter() -> bpf_convert_filter() to indicate that this function is converting classic BPF into eBPF and not related to sockets Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 2 +- kernel/seccomp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b479807..188ac5b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -446,7 +446,7 @@ load_word: /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are * only appearing in the programs where ctx == * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] - * == BPF_R6, sk_convert_filter() saves it in BPF_R6, + * == BPF_R6, bpf_convert_filter() saves it in BPF_R6, * internal BPF verifier will check that BPF_R6 == * ctx. 
* diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f4a77d2..33a3a97 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -249,7 +249,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) goto free_prog; /* Convert 'sock_filter' insns to 'bpf_insn' insns */ - ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); + ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len); if (ret) goto free_prog; @@ -265,7 +265,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) if (!filter->prog) goto free_filter; - ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); + ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); if (ret) goto free_filter_prog; kfree(fp); -- cgit v1.1 From 7ae457c1e5b45a1b826fad9d62b32191d2bdcfdb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:16 -0700 Subject: net: filter: split 'struct sk_filter' into socket and bpf parts clean up names related to socket filtering and bpf in the following way: - everything that deals with sockets keeps 'sk_*' prefix - everything that is pure BPF is changed to 'bpf_*' prefix split 'struct sk_filter' into struct sk_filter { atomic_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; and struct bpf_prog { u32 jited:1, len:31; struct sock_fprog_kern *orig_prog; unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; struct work_struct work; }; }; so that 'struct bpf_prog' can be used independent of sockets and cleans up 'unattached' bpf use cases split SK_RUN_FILTER macro into: SK_RUN_FILTER to be used with 'struct sk_filter *' and BPF_PROG_RUN to be used with 'struct bpf_prog *' __sk_filter_release(struct sk_filter *) gains __bpf_prog_release(struct bpf_prog *) helper function also perform related renames for the functions that work with 'struct bpf_prog *', since they're on the same lines: sk_filter_size -> bpf_prog_size 
sk_filter_select_runtime -> bpf_prog_select_runtime sk_filter_free -> bpf_prog_free sk_unattached_filter_create -> bpf_prog_create sk_unattached_filter_destroy -> bpf_prog_destroy sk_store_orig_filter -> bpf_prog_store_orig_filter sk_release_orig_filter -> bpf_release_orig_filter __sk_migrate_filter -> bpf_migrate_filter __sk_prepare_filter -> bpf_prepare_filter API for attaching classic BPF to a socket stays the same: sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *) and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program which is used by sockets, tun, af_packet API for 'unattached' BPF programs becomes: bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *) and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 30 ++++++++++++++---------------- kernel/seccomp.c | 10 +++++----- 2 files changed, 19 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 188ac5b..7f0dbcb 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -73,15 +73,13 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } /** - * __sk_run_filter - run a filter on a given context - * @ctx: buffer to run the filter on - * @insn: filter to apply + * __bpf_prog_run - run eBPF program on a given context + * @ctx: is the data we are operating on + * @insn: is the array of eBPF instructions * - * Decode and apply filter instructions to the skb->data. Return length to - * keep, 0 for none. @ctx is the data we are operating on, @insn is the - * array of filter instructions. + * Decode and execute eBPF instructions. 
*/ -static unsigned int __sk_run_filter(void *ctx, const struct bpf_insn *insn) +static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) { u64 stack[MAX_BPF_STACK / sizeof(u64)]; u64 regs[MAX_BPF_REG], tmp; @@ -508,29 +506,29 @@ load_byte: return 0; } -void __weak bpf_int_jit_compile(struct sk_filter *prog) +void __weak bpf_int_jit_compile(struct bpf_prog *prog) { } /** - * sk_filter_select_runtime - select execution runtime for BPF program - * @fp: sk_filter populated with internal BPF program + * bpf_prog_select_runtime - select execution runtime for BPF program + * @fp: bpf_prog populated with internal BPF program * * try to JIT internal BPF program, if JIT is not available select interpreter - * BPF program will be executed via SK_RUN_FILTER() macro + * BPF program will be executed via BPF_PROG_RUN() macro */ -void sk_filter_select_runtime(struct sk_filter *fp) +void bpf_prog_select_runtime(struct bpf_prog *fp) { - fp->bpf_func = (void *) __sk_run_filter; + fp->bpf_func = (void *) __bpf_prog_run; /* Probe if internal BPF can be JITed */ bpf_int_jit_compile(fp); } -EXPORT_SYMBOL_GPL(sk_filter_select_runtime); +EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); /* free internal BPF program */ -void sk_filter_free(struct sk_filter *fp) +void bpf_prog_free(struct bpf_prog *fp) { bpf_jit_free(fp); } -EXPORT_SYMBOL_GPL(sk_filter_free); +EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 33a3a97..2f3fa2c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -54,7 +54,7 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; - struct sk_filter *prog; + struct bpf_prog *prog; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -187,7 +187,7 @@ static u32 seccomp_run_filters(int syscall) * value always takes priority (ignoring the DATA). 
*/ for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); + u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; @@ -260,7 +260,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) if (!filter) goto free_prog; - filter->prog = kzalloc(sk_filter_size(new_len), + filter->prog = kzalloc(bpf_prog_size(new_len), GFP_KERNEL|__GFP_NOWARN); if (!filter->prog) goto free_filter; @@ -273,7 +273,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) atomic_set(&filter->usage, 1); filter->prog->len = new_len; - sk_filter_select_runtime(filter->prog); + bpf_prog_select_runtime(filter->prog); /* * If there is an existing filter, make it the prev and don't drop its @@ -337,7 +337,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; - sk_filter_free(freeme->prog); + bpf_prog_free(freeme->prog); kfree(freeme); } } -- cgit v1.1 From df5601f9c3d831b4c478b004a1ed90a18643adbe Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 7 Oct 2013 15:37:19 +0200 Subject: tracehook_signal_handler: Remove sig, info, ka and regs These parameters are nowhere used, so we can remove them. 
Signed-off-by: Richard Weinberger --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a4077e9..c4d4766 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2379,7 +2379,7 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, if (!(ka->sa.sa_flags & SA_NODEFER)) sigaddset(&blocked, sig); set_current_blocked(&blocked); - tracehook_signal_handler(sig, info, ka, regs, stepping); + tracehook_signal_handler(stepping); } void signal_setup_done(int failed, struct ksignal *ksig, int stepping) -- cgit v1.1 From 10b1c7ac8bfed429cf3dcb0225482c8dc1485d8e Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 13 Jul 2014 13:36:04 +0200 Subject: Clean up signal_delivered() - Pass a ksignal struct to it - Remove unused regs parameter - Make it private as it's nowhere outside of kernel/signal.c is used Signed-off-by: Richard Weinberger --- kernel/signal.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c4d4766..0d75cf8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2353,19 +2353,15 @@ relock: /** * signal_delivered - - * @sig: number of signal being delivered - * @info: siginfo_t of signal being delivered - * @ka: sigaction setting that chose the handler - * @regs: user register state + * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been - * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask + * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask * is always blocked, and the signal itself is blocked unless %SA_NODEFER - * is set in @ka->sa.sa_flags. Tracing is notified. + * is set in @ksig->ka.sa.sa_flags. Tracing is notified. 
*/ -void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, - struct pt_regs *regs, int stepping) +static void signal_delivered(struct ksignal *ksig, int stepping) { sigset_t blocked; @@ -2375,9 +2371,9 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, simply clear the restore sigmask flag. */ clear_restore_sigmask(); - sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); + sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask); + if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, ksig->sig); set_current_blocked(&blocked); tracehook_signal_handler(stepping); } @@ -2387,8 +2383,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping) if (failed) force_sigsegv(ksig->sig, current); else - signal_delivered(ksig->sig, &ksig->info, &ksig->ka, - signal_pt_regs(), stepping); + signal_delivered(ksig, stepping); } /* -- cgit v1.1 From 828b1f65d23cf8a68795739f6dd08fc8abd9ee64 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 7 Oct 2013 15:26:57 +0200 Subject: Rip out get_signal_to_deliver() Now we can turn get_signal() to the main function.
Signed-off-by: Richard Weinberger --- kernel/signal.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 0d75cf8..5c60200 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2166,8 +2166,7 @@ static int ptrace_signal(int signr, siginfo_t *info) return signr; } -int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, - struct pt_regs *regs, void *cookie) +int get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; @@ -2237,13 +2236,13 @@ relock: goto relock; } - signr = dequeue_signal(current, &current->blocked, info); + signr = dequeue_signal(current, &current->blocked, &ksig->info); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && signr != SIGKILL) { - signr = ptrace_signal(signr, info); + signr = ptrace_signal(signr, &ksig->info); if (!signr) continue; } @@ -2251,13 +2250,13 @@ relock: ka = &sighand->action[signr-1]; /* Trace actually delivered signals. */ - trace_signal_deliver(signr, info, ka); + trace_signal_deliver(signr, &ksig->info, ka); if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { /* Run the handler. */ - *return_ka = *ka; + ksig->ka = *ka; if (ka->sa.sa_flags & SA_ONESHOT) ka->sa.sa_handler = SIG_DFL; @@ -2307,7 +2306,7 @@ relock: spin_lock_irq(&sighand->siglock); } - if (likely(do_signal_stop(info->si_signo))) { + if (likely(do_signal_stop(ksig->info.si_signo))) { /* It released the siglock. */ goto relock; } @@ -2328,7 +2327,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(info->si_signo); + print_fatal_signal(ksig->info.si_signo); proc_coredump_connector(current); /* * If it was able to dump core, this kills all @@ -2338,17 +2337,19 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it.
*/ - do_coredump(info); + do_coredump(&ksig->info); } /* * Death signals, no core dump. */ - do_group_exit(info->si_signo); + do_group_exit(ksig->info.si_signo); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); - return signr; + + ksig->sig = signr; + return ksig->sig > 0; } /** -- cgit v1.1 From 372ba8cb46b271a7662b92cbefedee56725f6bd0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 14:19:21 +0100 Subject: cpuidle: menu: Lookup CPU runqueues less The menu governor makes separate lookups of the CPU runqueue to get load and number of IO waiters but it can be done with a single lookup. Signed-off-by: Mel Gorman Signed-off-by: Rafael J. Wysocki --- kernel/sched/core.c | 7 +++++++ kernel/sched/proc.c | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3bdf01b..863ef1d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2385,6 +2385,13 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } +void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) +{ + struct rq *this = this_rq(); + *nr_waiters = atomic_read(&this->nr_iowait); + *load = this->cpu_load[0]; +} + #ifdef CONFIG_SMP /* diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index 16f5a30..8ecd552 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -8,13 +8,6 @@ #include "sched.h" -unsigned long this_cpu_load(void) -{ - struct rq *this = this_rq(); - return this->cpu_load[0]; -} - - /* * Global load-average calculations * -- cgit v1.1 From 021de3d904b88b1771a3a2cfc5b75023c391e646 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 6 Aug 2014 15:36:31 -0400 Subject: ring-buffer: Up rb_iter_peek() loop count to 3 After writing a test to try to trigger the bug that caused the ring buffer iterator to become corrupted, I hit another bug: WARNING: CPU: 1 PID: 5281 at kernel/trace/ring_buffer.c:3766 rb_iter_peek+0x113/0x238()
Modules linked in: ipt_MASQUERADE sunrpc [...] CPU: 1 PID: 5281 Comm: grep Tainted: G W 3.16.0-rc3-test+ #143 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007 0000000000000000 ffffffff81809a80 ffffffff81503fb0 0000000000000000 ffffffff81040ca1 ffff8800796d6010 ffffffff810c138d ffff8800796d6010 ffff880077438c80 ffff8800796d6010 ffff88007abbe600 0000000000000003 Call Trace: [] ? dump_stack+0x4a/0x75 [] ? warn_slowpath_common+0x7e/0x97 [] ? rb_iter_peek+0x113/0x238 [] ? rb_iter_peek+0x113/0x238 [] ? ring_buffer_iter_peek+0x2d/0x5c [] ? tracing_iter_reset+0x6e/0x96 [] ? s_start+0xd7/0x17b [] ? kmem_cache_alloc_trace+0xda/0xea [] ? seq_read+0x148/0x361 [] ? vfs_read+0x93/0xf1 [] ? SyS_read+0x60/0x8e [] ? tracesys+0xdd/0xe2 Debugging this bug, which triggers when the rb_iter_peek() loops too many times (more than 2 times), I discovered there's a case that can cause that function to legitimately loop 3 times! rb_iter_peek() is different than rb_buffer_peek() as the rb_buffer_peek() only deals with the reader page (it's for consuming reads). The rb_iter_peek() is for traversing the buffer without consuming it, and as such, it can loop for one more reason. That is, if we hit the end of the reader page or any page, it will go to the next page and try again. That is, we have this: 1. iter->head > iter->head_page->page->commit (rb_inc_iter() which moves the iter to the next page) try again 2. event = rb_iter_head_event() event->type_len == RINGBUF_TYPE_TIME_EXTEND rb_advance_iter() try again 3. read the event. But we never get to 3, because the count is greater than 2 and we cause the WARNING and return NULL. Up the counter to 3. 
Cc: stable@vger.kernel.org # 2.6.37+ Fixes: 69d1b839f7ee "ring-buffer: Bind time extend and data events together" Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ff70271..31a9edd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1984,7 +1984,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) /** * rb_update_event - update event type and data - * @event: the even to update + * @event: the event to update * @type: the type of event * @length: the size of the event field in the ring buffer * @@ -3764,12 +3764,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; /* - * We repeat when a time extend is encountered. - * Since the time extend is always attached to a data event, - * we should never loop more than once. - * (We never hit the following condition more than twice). + * We repeat when a time extend is encountered or we hit + * the end of the page. Since the time extend is always attached + * to a data event, we should never loop more than three times. + * Once for going to next page, once on time extend, and + * finally once to get the event. + * (We never hit the following condition more than thrice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) -- cgit v1.1 From 651e22f2701b4113989237c3048d17337dd2185c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 6 Aug 2014 14:11:33 -0400 Subject: ring-buffer: Always reset iterator to reader page When performing a consuming read, the ring buffer swaps out a page from the ring buffer with a empty page and this page that was swapped out becomes the new reader page. 
The reader page is owned by the reader and since it was swapped out of the ring buffer, writers do not have access to it (there's an exception to that rule, but it's out of scope for this commit). When reading the "trace" file, it is a non consuming read, which means that the data in the ring buffer will not be modified. When the trace file is opened, a ring buffer iterator is allocated and writes to the ring buffer are disabled, such that the iterator will not have issues iterating over the data. Although the ring buffer disabled writes, it does not disable other reads, or even consuming reads. If a consuming read happens, then the iterator is reset and starts reading from the beginning again. My tests would sometimes trigger this bug on my i386 box: WARNING: CPU: 0 PID: 5175 at kernel/trace/trace.c:1527 __trace_find_cmdline+0x66/0xaa() Modules linked in: CPU: 0 PID: 5175 Comm: grep Not tainted 3.16.0-rc3-test+ #8 Hardware name: /DG965MQ, BIOS MQ96510J.86A.0372.2006.0605.1717 06/05/2006 00000000 00000000 f09c9e1c c18796b3 c1b5d74c f09c9e4c c103a0e3 c1b5154b f09c9e78 00001437 c1b5d74c 000005f7 c10bd85a c10bd85a c1cac57c f09c9eb0 ed0e0000 f09c9e64 c103a185 00000009 f09c9e5c c1b5154b f09c9e78 f09c9e80^M Call Trace: [] dump_stack+0x4b/0x75 [] warn_slowpath_common+0x7e/0x95 [] ? __trace_find_cmdline+0x66/0xaa [] ? __trace_find_cmdline+0x66/0xaa [] warn_slowpath_fmt+0x33/0x35 [] __trace_find_cmdline+0x66/0xaa^M [] trace_find_cmdline+0x40/0x64 [] trace_print_context+0x27/0xec [] ? trace_seq_printf+0x37/0x5b [] print_trace_line+0x319/0x39b [] ? ring_buffer_read+0x47/0x50 [] s_show+0x192/0x1ab [] ? s_next+0x5a/0x7c [] seq_read+0x267/0x34c [] vfs_read+0x8c/0xef [] ? seq_lseek+0x154/0x154 [] SyS_read+0x54/0x7f [] syscall_call+0x7/0xb ---[ end trace 3f507febd6b4cc83 ]--- >>>> ##### CPU 1 buffer started #### Which was the __trace_find_cmdline() function complaining about the pid in the event record being negative. After adding more test cases, this would trigger more often. 
Strangely enough, it would never trigger on a single test, but instead would trigger only when running all the tests. I believe that was the case because it required one of the tests to be shutting down via delayed instances while a new test started up. After spending several days debugging this, I found that it was caused by the iterator becoming corrupted. Debugging further, I found out why the iterator became corrupted. It happened with the rb_iter_reset(). As consuming reads may not read the full reader page, and only part of it, there's a "read" field to know where the last read took place. The iterator must also start at the read position. In the rb_iter_reset() code, if the reader page was disconnected from the ring buffer, the iterator would start at the head page within the ring buffer (where writes still happen). But the mistake there was that it still used the "read" field to start the iterator on the head page, where it should always start at zero because readers never read from within the ring buffer where writes occur. I originally wrote a patch to have it set the iter->head to 0 instead of iter->head_page->read, but then I questioned why it wasn't always setting the iter to point to the reader page, as the reader page is still valid. The list_empty(reader_page->list) just means that it was successful in swapping out. But the reader_page may still have data. There was a bug report a long time ago that was not reproducible that had something about trace_pipe (consuming read) not matching trace (iterator read). This may explain why that happened. Anyway, the correct answer to this bug is to always use the reader page and not reset the iterator to inside the writable ring buffer.
Cc: stable@vger.kernel.org # 2.6.28+ Fixes: d769041f8653 "ring_buffer: implement new locking" Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 31a9edd..b95381e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3357,21 +3357,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* Iterator usage is expected to have record disabled */ - if (list_empty(&cpu_buffer->reader_page->list)) { - iter->head_page = rb_set_head_page(cpu_buffer); - if (unlikely(!iter->head_page)) - return; - iter->head = iter->head_page->read; - } else { - iter->head_page = cpu_buffer->reader_page; - iter->head = cpu_buffer->reader_page->read; - } + iter->head_page = cpu_buffer->reader_page; + iter->head = cpu_buffer->reader_page->read; + + iter->cache_reader_page = iter->head_page; + iter->cache_read = iter->head; + if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; - iter->cache_reader_page = cpu_buffer->reader_page; - iter->cache_read = cpu_buffer->read; } /** -- cgit v1.1 From 84c91b7ae07c62cf6dee7fde3277f4be21331f85 Mon Sep 17 00:00:00 2001 From: "Lee, Chun-Yi" Date: Mon, 4 Aug 2014 23:23:21 +0800 Subject: PM / hibernate: avoid unsafe pages in e820 reserved regions When the machine doesn't well handle the e820 persistent when hibernate resuming, then it may cause page fault when writing image to snapshot buffer: [ 17.929495] BUG: unable to handle kernel paging request at ffff880069d4f000 [ 17.933469] IP: [] load_image_lzo+0x810/0xe40 [ 17.933469] PGD 2194067 PUD 77ffff067 PMD 2197067 PTE 0 [ 17.933469] Oops: 0002 [#1] SMP ... 
The ffff880069d4f000 page is in e820 reserved region of resume boot kernel: [ 0.000000] BIOS-e820: [mem 0x0000000069d4f000-0x0000000069e12fff] reserved ... [ 0.000000] PM: Registered nosave memory: [mem 0x69d4f000-0x69e12fff] So snapshot.c mark the pfn to forbidden pages map. But, this page is also in the memory bitmap in snapshot image because it's an original page used by image kernel, so it will also mark as an unsafe(free) page in prepare_image(). That means the page in e820 when resuming mark as "forbidden" and "free", it causes get_buffer() treat it as an allocated unsafe page. Then snapshot_write_next() return this page to load_image, load_image writing content to this address, but this page didn't really allocated . So, we got page fault. Although the root cause is from BIOS, I think aggressive check and significant message in kernel will better then a page fault for issue tracking, especially when serial console unavailable. This patch adds code in mark_unsafe_pages() for check does free pages in nosave region. If so, then it print message and return fault to stop whole S4 resume process: [ 8.166004] PM: Image loading progress: 0% [ 8.658717] PM: 0x6796c000 in e820 nosave region: [mem 0x6796c000-0x6796cfff] [ 8.918737] PM: Read 2511940 kbytes in 1.04 seconds (2415.32 MB/s) [ 8.926633] PM: Error -14 resuming [ 8.933534] PM: Failed to load hibernation image, recovering. Reviewed-by: Takashi Iwai Acked-by: Pavel Machek Signed-off-by: Lee, Chun-Yi [rjw: Subject] Signed-off-by: Rafael J. 
Wysocki --- kernel/power/snapshot.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4fc5c32..c4b8093 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -954,6 +954,25 @@ static void mark_nosave_pages(struct memory_bitmap *bm) } } +static bool is_nosave_page(unsigned long pfn) +{ + struct nosave_region *region; + + list_for_each_entry(region, &nosave_regions, list) { + if (pfn >= region->start_pfn && pfn < region->end_pfn) { + pr_err("PM: %#010llx in e820 nosave region: " + "[mem %#010llx-%#010llx]\n", + (unsigned long long) pfn << PAGE_SHIFT, + (unsigned long long) region->start_pfn << PAGE_SHIFT, + ((unsigned long long) region->end_pfn << PAGE_SHIFT) + - 1); + return true; + } + } + + return false; +} + /** * create_basic_memory_bitmaps - create bitmaps needed for marking page * frames that should not be saved and free page frames. The pointers @@ -2015,7 +2034,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) do { pfn = memory_bm_next_pfn(bm); if (likely(pfn != BM_END_OF_MAP)) { - if (likely(pfn_valid(pfn))) + if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) swsusp_set_page_free(pfn_to_page(pfn)); else return -EFAULT; -- cgit v1.1 From bab5e2d6522bc3cb892c1e8aaafecab05bed9d85 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:03:22 -0700 Subject: kernel/auditfilter.c: replace count*size kmalloc by kcalloc kcalloc manages count*sizeof overflow. 
Signed-off-by: Fabian Frederick Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8e9bc9c..c447cd9 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count) if (unlikely(!entry)) return NULL; - fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); + fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL); if (unlikely(!fields)) { kfree(entry); return NULL; @@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) { - __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); + __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); if (!p) return -ENOMEM; while (*list != ~0U) { -- cgit v1.1 From 656c3b79f782a235413087168b61ff279034d860 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:04:03 -0700 Subject: kernel/watchdog.c: convert printk/pr_warning to pr_foo() Replace some obsolete functions. 
Signed-off-by: Fabian Frederick Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c3319bd..51b29e9 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event, return; if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + panic("Watchdog detected hard LOCKUP on cpu %d", + this_cpu); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", + this_cpu); __this_cpu_write(hard_watchdog_warn, true); return; @@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } } - printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); @@ -484,7 +486,7 @@ static int watchdog_nmi_enable(unsigned int cpu) if (PTR_ERR(event) == -EOPNOTSUPP) pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - pr_warning("disabled (cpu%i): hardware events not enabled\n", + pr_warn("disabled (cpu%i): hardware events not enabled\n", cpu); else pr_err("disabled (cpu%i): unable to create perf event: %ld\n", -- cgit v1.1 From ed4d4902ebdd7ca8b5a51daaf6bebf4b172895cc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:06:54 -0700 Subject: mm, hugetlb: remove hugetlb_zero and hugetlb_infinity They are unnecessary: "zero" can be used in place of "hugetlb_zero" and passing extra2 == NULL is equivalent to infinity. Signed-off-by: David Rientjes Cc: Joonsoo Kim Reviewed-by: Naoya Horiguchi Reviewed-by: Luiz Capitulino Cc: "Kirill A. 
Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75b22e2..75875a7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #ifdef CONFIG_NUMA { @@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { @@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { -- cgit v1.1 From fb794bcbb4e5552242f9a4c5e1ffe4c6da29a968 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:58 -0700 Subject: mm, oom: remove unnecessary exit_state check The oom killer scans each process and determines whether it is eligible for oom kill or whether the oom killer should abort because of concurrent memory freeing. It will abort when an eligible process is found to have TIF_MEMDIE set, meaning it has already been oom killed and we're waiting for it to exit. Processes with task->mm == NULL should not be considered because they are either kthreads or have already detached their memory and killing them would not lead to memory freeing. That memory is only freed after exit_mm() has returned, however, and not when task->mm is first set to NULL. 
Clear TIF_MEMDIE after exit_mm()'s mmput() so that an oom killed process is no longer considered for oom kill, but only until exit_mm() has returned. This was fragile in the past because it relied on exit_notify() to be reached before no longer considering TIF_MEMDIE processes. Signed-off-by: David Rientjes Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e5c4668..88c6b3e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -455,6 +455,7 @@ static void exit_mm(struct task_struct * tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); + clear_thread_flag(TIF_MEMDIE); } /* -- cgit v1.1 From 618fde872163e782183ce574c77f1123e2be8887 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 6 Aug 2014 16:08:14 -0700 Subject: kernel/smp.c:on_each_cpu_cond(): fix warning in fallback path The rarely-executed memory-allocation-failed callback path generates a WARN_ON_ONCE() when smp_call_function_single() succeeds. Presumably it's supposed to warn on failures. Signed-off-by: Sasha Levin Cc: Christoph Lameter Cc: Gilad Ben-Yossef Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 487653b..aff8aa1 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -670,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), if (cond_func(cpu, info)) { ret = smp_call_function_single(cpu, func, info, wait); - WARN_ON_ONCE(!ret); + WARN_ON_ONCE(ret); } preempt_enable(); } -- cgit v1.1 From 7030017752437cebc3ec5590735bd89ead1e4cb8 Mon Sep 17 00:00:00 2001 From: "Luis R.
Rodriguez" Date: Wed, 6 Aug 2014 16:08:49 -0700 Subject: printk: make dynamic kernel ring buffer alignment explicit We have to consider alignment for the ring buffer both for the default static size, and then also for when a dynamic allocation is made when the log_buf_len=n kernel parameter is passed to set the size specifically to a size larger than the default size set by the architecture through CONFIG_LOG_BUF_SHIFT. The default static kernel ring buffer can be aligned properly if architectures set CONFIG_LOG_BUF_SHIFT properly, we provide ranges for the size though so even if CONFIG_LOG_BUF_SHIFT has a sensible aligned value it can be reduced to a non aligned value. Commit 6ebb017de9 ("printk: Fix alignment of buf causing crash on ARM EABI") by Andrew Lunn ensures the static buffer is always aligned and the decision of alignment is done by the compiler by using __alignof__(struct log). When log_buf_len=n is used we allocate the ring buffer dynamically. Dynamic allocation varies, for the early allocation called before setup_arch() memblock_virt_alloc() requests a page alignment and for the default kernel allocation memblock_virt_alloc_nopanic() requests no special alignment, which in turn ends up aligning the allocation to SMP_CACHE_BYTES, which is L1 cache aligned. Since we already have the required alignment for the kernel ring buffer though we can do better and request explicit alignment for LOG_ALIGN. This does that to be safe and make dynamic allocation alignment explicit. Signed-off-by: Luis R.
Rodriguez Tested-by: Petr Mladek Acked-by: Petr Mladek Cc: Andrew Lunn Cc: Stephen Warren Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Petr Mladek Cc: Joe Perches Cc: Arun KS Cc: Kees Cook Cc: Davidlohr Bueso Cc: Chris Metcalf Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 13e839d..6f598f9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -853,9 +853,10 @@ void __init setup_log_buf(int early) if (early) { new_log_buf = - memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); + memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); } else { - new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, + LOG_ALIGN); } if (unlikely(!new_log_buf)) { -- cgit v1.1 From c0a318a361e7652b8c4f7b91d3a31c771cf34e4f Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 6 Aug 2014 16:08:52 -0700 Subject: printk: move power of 2 practice of ring buffer size to a helper In practice the power of 2 practice of the size of the kernel ring buffer remains purely historical but not a requirement, specially now that we have LOG_ALIGN and use it for both static and dynamic allocations. It could have helped with implicit alignment back in the days given the even the dynamically sized ring buffer was guaranteed to be aligned so long as CONFIG_LOG_BUF_SHIFT was set to produce a __LOG_BUF_LEN which is architecture aligned, since log_buf_len=n would be allowed only if it was > __LOG_BUF_LEN and we always ended up rounding the log_buf_len=n to the next power of 2 with roundup_pow_of_two(), any multiple of 2 then should be also architecture aligned. These assumptions of course relied heavily on CONFIG_LOG_BUF_SHIFT producing an aligned value but users can always change this. 
We now have precise alignment requirements set for the log buffer size for both static and dynamic allocations, but lets upkeep the old practice of using powers of 2 for its size to help with easy expected scalable values and the allocators for dynamic allocations. We'll reuse this later so move this into a helper. Signed-off-by: Luis R. Rodriguez Cc: Andrew Lunn Cc: Stephen Warren Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Petr Mladek Cc: Joe Perches Cc: Arun KS Cc: Kees Cook Cc: Davidlohr Bueso Cc: Chris Metcalf Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6f598f9..32ad0c7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -828,15 +828,21 @@ void log_buf_kexec_setup(void) /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; -/* save requested log_buf_len since it's too early to process it */ -static int __init log_buf_len_setup(char *str) +/* we practice scaling the ring buffer by powers of 2 */ +static void __init log_buf_len_update(unsigned size) { - unsigned size = memparse(str, &str); - if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = size; +} + +/* save requested log_buf_len since it's too early to process it */ +static int __init log_buf_len_setup(char *str) +{ + unsigned size = memparse(str, &str); + + log_buf_len_update(size); return 0; } -- cgit v1.1 From f54051722e5715d24cd4469606ebdf488b6d5779 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 6 Aug 2014 16:08:54 -0700 Subject: printk: make dynamic units clear for the kernel ring buffer Signed-off-by: Luis R. 
Rodriguez Suggested-by: Davidlohr Bueso Cc: Andrew Lunn Cc: Stephen Warren Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Petr Mladek Cc: Joe Perches Cc: Arun KS Cc: Kees Cook Cc: Davidlohr Bueso Cc: Chris Metcalf Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 32ad0c7..db290be 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -879,7 +879,7 @@ void __init setup_log_buf(int early) memcpy(log_buf, __log_buf, __LOG_BUF_LEN); raw_spin_unlock_irqrestore(&logbuf_lock, flags); - pr_info("log_buf_len: %d\n", log_buf_len); + pr_info("log_buf_len: %d bytes\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", free, (free * 100) / __LOG_BUF_LEN); } -- cgit v1.1 From 23b2899f7f194f06e09b52a1f46f027a21fae17c Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 6 Aug 2014 16:08:56 -0700 Subject: printk: allow increasing the ring buffer depending on the number of CPUs The default size of the ring buffer is too small for machines with a large amount of CPUs under heavy load. What ends up happening when debugging is the ring buffer overlaps and chews up old messages making debugging impossible unless the size is passed as a kernel parameter. An idle system upon boot up will on average spew out only about one or two extra lines but where this really matters is on heavy load and that will vary widely depending on the system and environment. There are mechanisms to help increase the kernel ring buffer for tracing through debugfs, and those interfaces even allow growing the kernel ring buffer per CPU. We also have a static value which can be passed upon boot. Relying on debugfs however is not ideal for production, and relying on the value passed upon bootup is can only used *after* an issue has creeped up. 
Instead of being reactive this adds a proactive measure which lets you scale the amount of contributions you'd expect to the kernel ring buffer under load by each CPU in the worst case scenario. We use num_possible_cpus() to avoid complexities which could be introduced by dynamically changing the ring buffer size at run time, num_possible_cpus() lets us use the upper limit on possible number of CPUs therefore avoiding having to deal with hotplugging CPUs on and off. This introduces the kernel configuration option LOG_CPU_MAX_BUF_SHIFT which is used to specify the maximum amount of contributions to the kernel ring buffer in the worst case before the kernel ring buffer flips over, the size is specified as a power of 2. The total amount of contributions made by each CPU must be greater than half of the default kernel ring buffer size (1 << LOG_BUF_SHIFT bytes) in order to trigger an increase upon bootup. The kernel ring buffer is increased to the next power of two that would fit the required minimum kernel ring buffer size plus the additional CPU contribution. For example if LOG_BUF_SHIFT is 18 (256 KB) you'd require at least 128 KB contributions by other CPUs in order to trigger an increase of the kernel ring buffer. With a LOG_CPU_MAX_BUF_SHIFT of 12 (4 KB) you'd require at least anything over > 64 possible CPUs to trigger an increase. If you had 128 possible CPUs the amount of minimum required kernel ring buffer bumps to: ((1 << 18) + ((128 - 1) * (1 << 12))) / 1024 = 764 KB Since we require the ring buffer to be a power of two the new required size would be 1024 KB. These CPU contributions are ignored when the "log_buf_len" kernel parameter is used as it forces the exact size of the ring buffer to an expected power of two value. [pmladek@suse.cz: fix build] Signed-off-by: Luis R.
Rodriguez Signed-off-by: Petr Mladek Tested-by: Davidlohr Bueso Tested-by: Petr Mladek Reviewed-by: Davidlohr Bueso Cc: Andrew Lunn Cc: Stephen Warren Cc: Michal Hocko Cc: Petr Mladek Cc: Joe Perches Cc: Arun KS Cc: Kees Cook Cc: Davidlohr Bueso Cc: Chris Metcalf Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index db290be..f855ec3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -266,6 +266,7 @@ static u32 clear_idx; #define LOG_ALIGN __alignof__(struct printk_log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -848,12 +849,45 @@ static int __init log_buf_len_setup(char *str) } early_param("log_buf_len", log_buf_len_setup); +static void __init log_buf_add_cpu(void) +{ + unsigned int cpu_extra; + + /* + * archs should set up cpu_possible_bits properly with + * set_cpu_possible() after setup_arch() but just in + * case lets ensure this is valid. 
+ */ + if (num_possible_cpus() == 1) + return; + + cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; + + /* by default this will only continue through for large > 64 CPUs */ + if (cpu_extra <= __LOG_BUF_LEN / 2) + return; + + pr_info("log_buf_len individual max cpu contribution: %d bytes\n", + __LOG_CPU_MAX_BUF_LEN); + pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", + cpu_extra); + pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); + + log_buf_len_update(cpu_extra + __LOG_BUF_LEN); +} + void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; int free; + if (log_buf != __log_buf) + return; + + if (!early && !new_log_buf_len) + log_buf_add_cpu(); + if (!new_log_buf_len) return; -- cgit v1.1 From e97e1267e9faa6480898a1fc34c8e40d74d702f2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:08:59 -0700 Subject: printk: tweak do_syslog() to match comments In do_syslog() there's a path used by kmsg_poll() and kmsg_read() that only needs to know whether there's any data available to read (and not its size). These callers only check for non-zero return. As a shortcut, do_syslog() returns the difference between what has been logged and what has been "seen." The comments say that the "count of records" should be returned but it's not. Instead it returns (log_next_idx - syslog_idx), which is a difference between buffer offsets--and the result could be negative. The behavior is the same (it'll be zero or not in the same cases), but the count of records is more meaningful and it matches what the comments say. So change the code to return that. 
Signed-off-by: Alex Elder Cc: Petr Mladek Cc: Jan Kara Cc: Joe Perches Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f855ec3..ec3bfb0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1351,7 +1351,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_idx - syslog_idx; + error = log_next_seq - syslog_seq; } else { u64 seq = syslog_seq; u32 idx = syslog_idx; -- cgit v1.1 From 42a9dc0b3d0f749375c767c7d5cab56e89160576 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:09:01 -0700 Subject: printk: rename DEFAULT_MESSAGE_LOGLEVEL Commit a8fe19ebfbfd ("kernel/printk: use symbolic defines for console loglevels") makes consistent use of symbolic values for printk() log levels. The naming scheme used is different from the one used for DEFAULT_MESSAGE_LOGLEVEL though. Change that symbol name to be MESSAGE_LOGLEVEL_DEFAULT for consistency. And because the value of that symbol comes from a similarly-named config option, rename CONFIG_DEFAULT_MESSAGE_LOGLEVEL as well. 
Signed-off-by: Alex Elder Cc: Andi Kleen Cc: Borislav Petkov Cc: Jan Kara Cc: John Stultz Cc: Petr Mladek Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ec3bfb0..770ed48 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -56,7 +56,7 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ - DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ + MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; -- cgit v1.1 From 0b90fec3b990b50d77944bc73c1ba4b031dfa52f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:09:03 -0700 Subject: printk: fix some comments Fix a few comments that don't accurately describe their corresponding code. It also fixes some minor typographical errors. Signed-off-by: Alex Elder Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Borislav Petkov Cc: Jan Kara Cc: John Stultz Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 770ed48..4bae344 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -113,9 +113,9 @@ static int __down_trylock_console_sem(unsigned long ip) * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. 
It's * definitely not the perfect debug tool (we don't know if _WE_ - * hold it are racing, but it helps tracking those weird code - * path in the console code where we end up in places I want - * locked without the console sempahore held + * hold it and are racing, but it helps tracking those weird code + * paths in the console code where we end up in places I want + * locked without the console sempahore held). */ static int console_locked, console_suspended; @@ -146,8 +146,8 @@ static int console_may_schedule; * the overall length of the record. * * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these both entries are maintained when messages - * are stored.. + * sequence numbers of these entries are maintained when messages are + * stored. * * If the heads indicate available messages, the length in the header * tells the start next message. A length == 0 for the next message @@ -345,7 +345,7 @@ static int log_make_free_space(u32 msg_size) while (log_first_seq < log_next_seq) { if (logbuf_has_space(msg_size, false)) return 0; - /* drop old messages until we have enough continuous space */ + /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } @@ -1517,7 +1517,7 @@ static struct cont { struct task_struct *owner; /* task of first print*/ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ - u8 facility; /* log level of first message */ + u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; @@ -1922,11 +1922,12 @@ static int __add_preferred_console(char *name, int idx, char *options, return 0; } /* - * Set up a list of consoles. Called from init/main.c + * Set up a console. Called via do_early_param() in init/main.c + * for each "console=" parameter in the boot command line. 
*/ static int __init console_setup(char *str) { - char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ + char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */ char *s, *options, *brl_options = NULL; int idx; @@ -2086,8 +2087,8 @@ EXPORT_SYMBOL(console_lock); /** * console_trylock - try to lock the console system for exclusive use. * - * Tried to acquire a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. + * Try to acquire a lock which guarantees that the caller has exclusive + * access to the console system and the console_drivers list. * * returns 1 on success, and 0 on failure to acquire the lock. */ -- cgit v1.1 From e99aa461660a6413b11da887fb499e04a0f46803 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:09:05 -0700 Subject: printk: use a clever macro Use the IS_ENABLED() macro rather than #ifdef blocks to set certain global values. Signed-off-by: Alex Elder Acked-by: Borislav Petkov Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Jan Kara Cc: John Stultz Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4bae344..ac86838 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -454,11 +454,7 @@ static int log_store(int facility, int level, return msg->text_len; } -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif +int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { @@ -988,11 +984,7 @@ static inline void boot_delay_msec(int level) } #endif -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time; -#endif +static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, 
printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_time(u64 ts, char *buf) -- cgit v1.1 From 249771b8307e7a91659d8b273f8b70d48c3a7bfc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:09:08 -0700 Subject: printk: miscellaneous cleanups Some small cleanups to kernel/printk/printk.c. None of them should cause any change in behavior. - When CONFIG_PRINTK is defined, parenthesize the value of LOG_LINE_MAX. - When CONFIG_PRINTK is *not* defined, there is an extra LOG_LINE_MAX definition; delete it. - Pull an assignment out of a conditional expression in console_setup(). - Use isdigit() in console_setup() rather than open coding it. - In update_console_cmdline(), drop a NUL-termination assignment; the strlcpy() call that precedes it guarantees it's not needed. - Simplify some logic in printk_timed_ratelimit(). Signed-off-by: Alex Elder Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Borislav Petkov Cc: Jan Kara Cc: John Stultz Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ac86838..5eb0e6c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -45,6 +45,7 @@ #include #include #include +#include #include @@ -257,7 +258,7 @@ static u64 clear_seq; static u32 clear_idx; #define PREFIX_MAX 32 -#define LOG_LINE_MAX 1024 - PREFIX_MAX +#define LOG_LINE_MAX (1024 - PREFIX_MAX) /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -1835,7 +1836,7 @@ EXPORT_SYMBOL(printk); #define LOG_LINE_MAX 0 #define PREFIX_MAX 0 -#define LOG_LINE_MAX 0 + static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; @@ -1936,7 +1937,8 @@ static int __init console_setup(char *str) strncpy(buf, str, sizeof(buf) - 1); } buf[sizeof(buf) - 1] = 0; - if ((options = strchr(str, ',')) != NULL) + options = strchr(str, 
','); + if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) @@ -1945,7 +1947,7 @@ static int __init console_setup(char *str) strcpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) - if ((*s >= '0' && *s <= '9') || *s == ',') + if (isdigit(*s) || *s == ',') break; idx = simple_strtoul(s, NULL, 10); *s = 0; @@ -1984,7 +1986,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha i++, c++) if (strcmp(c->name, name) == 0 && c->index == idx) { strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; c->options = options; c->index = idx_new; return i; @@ -2652,14 +2653,13 @@ EXPORT_SYMBOL(__printk_ratelimit); bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { - if (*caller_jiffies == 0 - || !time_in_range(jiffies, *caller_jiffies, - *caller_jiffies - + msecs_to_jiffies(interval_msecs))) { - *caller_jiffies = jiffies; - return true; - } - return false; + unsigned long elapsed = jiffies - *caller_jiffies; + + if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) + return false; + + *caller_jiffies = jiffies; + return true; } EXPORT_SYMBOL(printk_timed_ratelimit); -- cgit v1.1 From 5874af2003b1aaaa053128d655710140e3187226 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Aug 2014 16:09:10 -0700 Subject: printk: enable interrupts before calling console_trylock_for_printk() We need interrupts disabled when calling console_trylock_for_printk() only so that cpu id we pass to can_use_console() remains valid (for other things console_sem provides all the exclusion we need and deadlocks on console_sem due to interrupts are impossible because we use down_trylock()). However if we are rescheduled, we are guaranteed to run on an online cpu so we can easily just get the cpu id in can_use_console(). 
We can lose a bit of performance when we enable interrupts in vprintk_emit() and then disable them again in console_unlock() but OTOH it can somewhat reduce interrupt latency caused by console_unlock(). We differ from (reverted) commit 939f04bec1a4 in that we avoid calling console_unlock() from vprintk_emit() with lockdep enabled as that has unveiled quite some bugs leading to system freezes during boot (e.g. https://lkml.org/lkml/2014/5/30/242, https://lkml.org/lkml/2014/6/28/521). Signed-off-by: Jan Kara Tested-by: Andreas Bombe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5eb0e6c..df202fe 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1450,10 +1450,9 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. */ static inline int can_use_console(unsigned int cpu) { @@ -1466,8 +1465,10 @@ static inline int can_use_console(unsigned int cpu) * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. 
*/ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(void) { + unsigned int cpu = smp_processor_id(); + if (!console_trylock()) return 0; /* @@ -1642,7 +1643,8 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - goto out_restore_irqs; + local_irq_restore(flags); + return 0; } zap_locks(); } @@ -1750,21 +1752,30 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { + lockdep_off(); + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to + * console + */ + preempt_disable(); + /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk()) console_unlock(); + preempt_enable(); + lockdep_on(); } - lockdep_on(); -out_restore_irqs: - local_irq_restore(flags); return printed_len; } EXPORT_SYMBOL(vprintk_emit); -- cgit v1.1 From d25d9feced6c94398979a035868f03e8e8d49ce8 Mon Sep 17 00:00:00 2001 From: Neil Zhang Date: Wed, 6 Aug 2014 16:09:12 -0700 Subject: kernel/printk/printk.c: fix bool assignements Fix coccinelle warnings. 
Signed-off-by: Neil Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index df202fe..de1a6bb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -919,7 +919,7 @@ static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { - ignore_loglevel = 1; + ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; @@ -2005,12 +2005,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha return -1; } -bool console_suspend_enabled = 1; +bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { - console_suspend_enabled = 0; + console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); -- cgit v1.1 From ecfdb33d1fbc7e6e095ba24dac2930208494e734 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Apr 2014 14:44:49 -0400 Subject: acct: encode_comp_t(0) is 0, fortunately... There was an amusing bogosity in ac_rw calculation - it tried to do encode_comp_t(encode_comp_t(0) / 1024). Seeing that comp_t is a 3-bit exponent + 13-bit mantissa... it's a good thing that 0 is represented by all-bits-clear. The history of that one is interesting - it was introduced in 2.1.68pre1, when acct.c had been reworked and moved to separate file. Two months later (2.1.86) somebody has noticed that the sucker won't compile - there was no task_struct::io_usage. At which point the ac_io calculation had changed from encode_comp_t(current->io_usage) to encode_comp_t(0) and the bug in the next line (absolutely real back then, had it ever managed to compile) become a harmless bogosity. Looks like nobody has ever noticed until now. Anyway, let's bury that idiocy now that it got noticed. 
17 years is long enough... Signed-off-by: Al Viro --- kernel/acct.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index a1844f1..807ebc5 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -531,9 +531,6 @@ static void do_acct_process(struct bsd_acct_struct *acct, ac.ac_majflt = encode_comp_t(pacct->ac_majflt); ac.ac_exitcode = pacct->ac_exitcode; spin_unlock_irq(&current->sighand->siglock); - ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ - ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_swaps = encode_comp_t(0); /* * Get freeze protection. If the fs is frozen, just skip the write -- cgit v1.1 From ed44724b79d8e03a40665436019cf22baba80d30 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Apr 2014 14:37:20 -0400 Subject: acct: switch to __kernel_write() Signed-off-by: Al Viro --- kernel/acct.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 807ebc5..8082d98 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -456,12 +456,16 @@ static void do_acct_process(struct bsd_acct_struct *acct, { struct pacct_struct *pacct = &current->signal->pacct; acct_t ac; - mm_segment_t fs; unsigned long flim; u64 elapsed, run_time; struct tty_struct *tty; const struct cred *orig_cred; + /* + * Accounting records are not subject to resource limits. + */ + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; /* Perform file operations on behalf of whoever enabled accounting */ orig_cred = override_creds(file->f_cred); @@ -536,25 +540,14 @@ static void do_acct_process(struct bsd_acct_struct *acct, * Get freeze protection. If the fs is frozen, just skip the write * as we could deadlock the system otherwise. */ - if (!file_start_write_trylock(file)) - goto out; - /* - * Kernel segment override to datasegment and write it - * to the accounting file.
- */ - fs = get_fs(); - set_fs(KERNEL_DS); - /* - * Accounting records are not subject to resource limits. - */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - file->f_op->write(file, (char *)&ac, - sizeof(acct_t), &file->f_pos); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - set_fs(fs); - file_end_write(file); + if (file_start_write_trylock(file)) { + /* it's been opened O_APPEND, so position is irrelevant */ + loff_t pos = 0; + __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos); + file_end_write(file); + } out: + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; revert_creds(orig_cred); } -- cgit v1.1 From cdd37e23092c3c6fbbb2e611f8c3d18e676bf28f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Apr 2014 23:45:53 -0400 Subject: separate namespace-independent parts of filling acct_t Signed-off-by: Al Viro --- kernel/acct.c | 98 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 51 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 8082d98..efa891b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -448,42 +448,20 @@ static u32 encode_float(u64 value) * do_exit() or when switching to a different output file. */ -/* - * do_acct_process does all actual work. Caller holds the reference to file. - */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *file) +static void fill_ac(acct_t *ac) { struct pacct_struct *pacct = &current->signal->pacct; - acct_t ac; - unsigned long flim; u64 elapsed, run_time; struct tty_struct *tty; - const struct cred *orig_cred; - - /* - * Accounting records are not subject to resource limits.
- */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. - */ - if (!check_free_space(acct, file)) - goto out; /* * Fill the accounting struct with the needed info as recorded * by the different kernel functions. */ - memset(&ac, 0, sizeof(acct_t)); + memset(ac, 0, sizeof(acct_t)); - ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; - strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); + ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER; + strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm)); /* calculate run_time in nsec*/ run_time = ktime_get_ns(); @@ -491,27 +469,66 @@ static void do_acct_process(struct bsd_acct_struct *acct, /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); #if ACCT_VERSION==3 - ac.ac_etime = encode_float(elapsed); + ac->ac_etime = encode_float(elapsed); #else - ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? + ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? (unsigned long) elapsed : (unsigned long) -1l); #endif #if ACCT_VERSION==1 || ACCT_VERSION==2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); - ac.ac_etime_hi = etime >> 16; - ac.ac_etime_lo = (u16) etime; + ac->ac_etime_hi = etime >> 16; + ac->ac_etime_lo = (u16) etime; } #endif do_div(elapsed, AHZ); - ac.ac_btime = get_seconds() - elapsed; + ac->ac_btime = get_seconds() - elapsed; +#if ACCT_VERSION==2 + ac->ac_ahz = AHZ; +#endif + + spin_lock_irq(&current->sighand->siglock); + tty = current->signal->tty; /* Safe as we hold the siglock */ + ac->ac_tty = tty ?
old_encode_dev(tty_devnum(tty)) : 0; + ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); + ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); + ac->ac_flag = pacct->ac_flag; + ac->ac_mem = encode_comp_t(pacct->ac_mem); + ac->ac_minflt = encode_comp_t(pacct->ac_minflt); + ac->ac_majflt = encode_comp_t(pacct->ac_majflt); + ac->ac_exitcode = pacct->ac_exitcode; + spin_unlock_irq(&current->sighand->siglock); +} +/* + * do_acct_process does all actual work. Caller holds the reference to file. + */ +static void do_acct_process(struct bsd_acct_struct *acct, + struct pid_namespace *ns, struct file *file) +{ + acct_t ac; + unsigned long flim; + const struct cred *orig_cred; + + /* + * Accounting records are not subject to resource limits. + */ + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + /* Perform file operations on behalf of whoever enabled accounting */ + orig_cred = override_creds(file->f_cred); + + /* + * First check to see if there is enough free_space to continue + * the process accounting system. + */ + if (!check_free_space(acct, file)) + goto out; + + fill_ac(&ac); /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==2 - ac.ac_ahz = AHZ; -#endif #if ACCT_VERSION==1 || ACCT_VERSION==2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; @@ -523,19 +540,6 @@ static void do_acct_process(struct bsd_acct_struct *acct, ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); rcu_read_unlock(); #endif - - spin_lock_irq(&current->sighand->siglock); - tty = current->signal->tty; /* Safe as we hold the siglock */ - ac.ac_tty = tty ?
old_encode_dev(tty_devnum(tty)) : 0; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); - ac.ac_flag = pacct->ac_flag; - ac.ac_mem = encode_comp_t(pacct->ac_mem); - ac.ac_minflt = encode_comp_t(pacct->ac_minflt); - ac.ac_majflt = encode_comp_t(pacct->ac_majflt); - ac.ac_exitcode = pacct->ac_exitcode; - spin_unlock_irq(&current->sighand->siglock); - /* * Get freeze protection. If the fs is frozen, just skip the write * as we could deadlock the system otherwise. -- cgit v1.1 From e25ff11ff16aba000dfe9e568d867e5142c31f16 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 May 2014 05:12:09 -0400 Subject: split the slow path in acct_process() off Signed-off-by: Al Viro --- kernel/acct.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index efa891b..5118860 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -599,34 +599,35 @@ void acct_collect(long exitcode, int group_dead) spin_unlock_irq(&current->sighand->siglock); } -static void acct_process_in_ns(struct pid_namespace *ns) +static void slow_acct_process(struct pid_namespace *ns) { - struct file *file = NULL; - struct bsd_acct_struct *acct; + for ( ; ns; ns = ns->parent) { + struct file *file = NULL; + struct bsd_acct_struct *acct; - acct = ns->bacct; - /* - * accelerate the common fastpath: - */ - if (!acct || !acct->file) - return; + acct = ns->bacct; + /* + * accelerate the common fastpath: + */ + if (!acct || !acct->file) + continue; - spin_lock(&acct_lock); - file = acct->file; - if (unlikely(!file)) { + spin_lock(&acct_lock); + file = acct->file; + if (unlikely(!file)) { + spin_unlock(&acct_lock); + continue; + } + get_file(file); spin_unlock(&acct_lock); - return; - } - get_file(file); - spin_unlock(&acct_lock); - do_acct_process(acct, ns, file); - fput(file); + do_acct_process(acct, ns,
file); + fput(file); + } } /** - * acct_process - now just a wrapper around acct_process_in_ns, - * which in turn is a wrapper around do_acct_process. + * acct_process * * handles process accounting for an exiting task */ @@ -639,6 +640,11 @@ void acct_process(void) * alive and holds its namespace, which in turn holds * its parent. */ - for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) - acct_process_in_ns(ns); + for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) { + struct bsd_acct_struct *acct = ns->bacct; + if (acct && acct->file) + break; + } + if (unlikely(ns)) + slow_acct_process(ns); } -- cgit v1.1 From 795a2f22a8eaf749e20a11271a8821bf04ac6d90 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 May 2014 05:23:41 -0400 Subject: acct() should honour the limits from the very beginning We need to check free space on the first write to freshly opened log. Signed-off-by: Al Viro --- kernel/acct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 5118860..8777372 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -180,8 +180,8 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, if (file) { acct->file = file; acct->ns = ns; - acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - acct->active = 1; + acct->needcheck = jiffies; + acct->active = 0; list_add(&acct->list, &acct_list); } if (old_acct) { -- cgit v1.1 From 9df7fa16ee956bf0cdf4a711eac827be92d584bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 May 2014 06:49:45 -0400 Subject: acct: serialize acct_on() brute-force - on a global mutex that isn't nested into anything. 
Signed-off-by: Al Viro --- kernel/acct.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 8777372..08963a2 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -241,6 +241,8 @@ static int acct_on(struct filename *pathname) return 0; } +static DEFINE_MUTEX(acct_on_mutex); + /** * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting @@ -263,7 +265,9 @@ SYSCALL_DEFINE1(acct, const char __user *, name) struct filename *tmp = getname(name); if (IS_ERR(tmp)) return PTR_ERR(tmp); + mutex_lock(&acct_on_mutex); error = acct_on(tmp); + mutex_unlock(&acct_on_mutex); putname(tmp); } else { struct bsd_acct_struct *acct; -- cgit v1.1 From b8f00e6be46f4c9a112e05fd692712873c4c4048 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 07:51:03 -0400 Subject: acct: new lifetime rules Do not reuse bsd_acct_struct after closing the damn thing. Structure lifetime is controlled by refcount now. We also have a mutex in there, held over closing and writing (the file is O_APPEND, so we are not losing any concurrency). As the result, we do not need to bother with get_file()/fput() on log write anymore. Moreover, do_acct_process() only needs acct itself; file and pidns are picked from it. Killed instances are distinguished by having NULL ->ns. Refcount is protected by acct_lock; anybody taking the mutex needs to grab a reference first. The things will get a lot simpler in the next commits - this is just the minimal chunk switching to the new lifetime rules. Signed-off-by: Al Viro --- kernel/acct.c | 220 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 114 insertions(+), 106 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 08963a2..f9ef9db 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -75,15 +75,11 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. 
*/ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *); +static void do_acct_process(struct bsd_acct_struct *acct); -/* - * This structure is used so that all the data protected by lock - * can be placed in the same cache line as the lock. This primes - * the cache line to have the data after getting the lock. - */ struct bsd_acct_struct { + long count; + struct mutex lock; int active; unsigned long needcheck; struct file *file; @@ -157,39 +153,59 @@ out: return res; } -/* - * Close the old accounting file (if currently open) and then replace - * it with file (if non-NULL). - * - * NOTE: acct_lock MUST be held on entry and exit. - */ -static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, - struct pid_namespace *ns) +static void acct_put(struct bsd_acct_struct *p) { - struct file *old_acct = NULL; - struct pid_namespace *old_ns = NULL; - - if (acct->file) { - old_acct = acct->file; - old_ns = acct->ns; - acct->active = 0; - acct->file = NULL; - acct->ns = NULL; - list_del(&acct->list); - } - if (file) { - acct->file = file; - acct->ns = ns; - acct->needcheck = jiffies; - acct->active = 0; - list_add(&acct->list, &acct_list); + spin_lock(&acct_lock); + if (!--p->count) + kfree(p); + spin_unlock(&acct_lock); +} + +static struct bsd_acct_struct *acct_get(struct bsd_acct_struct **p) +{ + struct bsd_acct_struct *res; + spin_lock(&acct_lock); +again: + res = *p; + if (res) + res->count++; + spin_unlock(&acct_lock); + if (res) { + mutex_lock(&res->lock); + if (!res->ns) { + mutex_unlock(&res->lock); + spin_lock(&acct_lock); + if (!--res->count) + kfree(res); + goto again; + } } - if (old_acct) { - mnt_unpin(old_acct->f_path.mnt); + return res; +} + +static void acct_kill(struct bsd_acct_struct *acct, + struct bsd_acct_struct *new) +{ + if (acct) { + struct file *file = acct->file; + struct pid_namespace *ns = acct->ns; + spin_lock(&acct_lock); + list_del(&acct->list); + mnt_unpin(file->f_path.mnt); 
spin_unlock(&acct_lock); - do_acct_process(acct, old_ns, old_acct); - filp_close(old_acct, NULL); + do_acct_process(acct); + filp_close(file, NULL); spin_lock(&acct_lock); + ns->bacct = new; + if (new) { + mnt_pin(new->file->f_path.mnt); + list_add(&new->list, &acct_list); + } + acct->ns = NULL; + mutex_unlock(&acct->lock); + if (!(acct->count -= 2)) + kfree(acct); + spin_unlock(&acct_lock); } } @@ -197,47 +213,50 @@ static int acct_on(struct filename *pathname) { struct file *file; struct vfsmount *mnt; - struct pid_namespace *ns; - struct bsd_acct_struct *acct = NULL; + struct pid_namespace *ns = task_active_pid_ns(current); + struct bsd_acct_struct *acct, *old; + + acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); + if (!acct) + return -ENOMEM; /* Difference from BSD - they don't do O_APPEND */ file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); - if (IS_ERR(file)) + if (IS_ERR(file)) { + kfree(acct); return PTR_ERR(file); + } if (!S_ISREG(file_inode(file)->i_mode)) { + kfree(acct); filp_close(file, NULL); return -EACCES; } if (!file->f_op->write) { + kfree(acct); filp_close(file, NULL); return -EIO; } - ns = task_active_pid_ns(current); - if (ns->bacct == NULL) { - acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); - if (acct == NULL) { - filp_close(file, NULL); - return -ENOMEM; - } - } + acct->count = 1; + acct->file = file; + acct->needcheck = jiffies; + acct->ns = ns; + mutex_init(&acct->lock); + mnt = file->f_path.mnt; - spin_lock(&acct_lock); - if (ns->bacct == NULL) { + old = acct_get(&ns->bacct); + if (old) { + acct_kill(old, acct); + } else { + spin_lock(&acct_lock); ns->bacct = acct; - acct = NULL; + mnt_pin(mnt); + list_add(&acct->list, &acct_list); + spin_unlock(&acct_lock); } - - mnt = file->f_path.mnt; - mnt_pin(mnt); - acct_file_reopen(ns->bacct, file, ns); - spin_unlock(&acct_lock); - mntput(mnt); /* it's pinned, now give up active reference */ - kfree(acct); - return 0; } @@ -270,15 +289,7 @@ 
SYSCALL_DEFINE1(acct, const char __user *, name) mutex_unlock(&acct_on_mutex); putname(tmp); } else { - struct bsd_acct_struct *acct; - - acct = task_active_pid_ns(current)->bacct; - if (acct == NULL) - return 0; - - spin_lock(&acct_lock); - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); + acct_kill(acct_get(&task_active_pid_ns(current)->bacct), NULL); } return error; @@ -298,8 +309,19 @@ void acct_auto_close_mnt(struct vfsmount *m) spin_lock(&acct_lock); restart: list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.mnt == m) { - acct_file_reopen(acct, NULL, NULL); + if (acct->file->f_path.mnt == m) { + acct->count++; + spin_unlock(&acct_lock); + mutex_lock(&acct->lock); + if (!acct->ns) { + mutex_unlock(&acct->lock); + spin_lock(&acct_lock); + if (!--acct->count) + kfree(acct); + goto restart; + } + acct_kill(acct, NULL); + spin_lock(&acct_lock); goto restart; } spin_unlock(&acct_lock); @@ -319,8 +341,19 @@ void acct_auto_close(struct super_block *sb) spin_lock(&acct_lock); restart: list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.dentry->d_sb == sb) { - acct_file_reopen(acct, NULL, NULL); + if (acct->file->f_path.dentry->d_sb == sb) { + acct->count++; + spin_unlock(&acct_lock); + mutex_lock(&acct->lock); + if (!acct->ns) { + mutex_unlock(&acct->lock); + spin_lock(&acct_lock); + if (!--acct->count) + kfree(acct); + goto restart; + } + acct_kill(acct, NULL); + spin_lock(&acct_lock); goto restart; } spin_unlock(&acct_lock); @@ -328,17 +361,7 @@ restart: void acct_exit_ns(struct pid_namespace *ns) { - struct bsd_acct_struct *acct = ns->bacct; - - if (acct == NULL) - return; - - spin_lock(&acct_lock); - if (acct->file != NULL) - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - - kfree(acct); + acct_kill(acct_get(&ns->bacct), NULL); } /* @@ -507,12 +530,13 @@ static void fill_ac(acct_t *ac) /* * do_acct_process does all actual work. Caller holds the reference to file. 
*/ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *file) +static void do_acct_process(struct bsd_acct_struct *acct) { acct_t ac; unsigned long flim; const struct cred *orig_cred; + struct pid_namespace *ns = acct->ns; + struct file *file = acct->file; /* * Accounting records are not subject to resource limits. @@ -606,27 +630,12 @@ void acct_collect(long exitcode, int group_dead) static void slow_acct_process(struct pid_namespace *ns) { for ( ; ns; ns = ns->parent) { - struct file *file = NULL; - struct bsd_acct_struct *acct; - - acct = ns->bacct; - /* - * accelerate the common fastpath: - */ - if (!acct || !acct->file) - continue; - - spin_lock(&acct_lock); - file = acct->file; - if (unlikely(!file)) { - spin_unlock(&acct_lock); - continue; + struct bsd_acct_struct *acct = acct_get(&ns->bacct); + if (acct) { + do_acct_process(acct); + mutex_unlock(&acct->lock); + acct_put(acct); } - get_file(file); - spin_unlock(&acct_lock); - - do_acct_process(acct, ns, file); - fput(file); } } @@ -645,8 +654,7 @@ void acct_process(void) * its parent. */ for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) { - struct bsd_acct_struct *acct = ns->bacct; - if (acct && acct->file) + if (ns->bacct) break; } if (unlikely(ns)) -- cgit v1.1 From 54a4d58a6459a93fc6ee898354b3d2ffb80dd03a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Apr 2014 14:24:18 -0400 Subject: acct: simplify check_free_space() a) file can't be NULL b) file can't be changed under us c) all writes are serialized by acct->lock; no need to mess with spinlock there. 
Signed-off-by: Al Viro --- kernel/acct.c | 50 +++++++++++--------------------------------------- 1 file changed, 11 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index f9ef9db..019f012 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -93,64 +93,36 @@ static LIST_HEAD(acct_list); /* * Check the amount of free space and suspend/resume accordingly. */ -static int check_free_space(struct bsd_acct_struct *acct, struct file *file) +static int check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - int res; - int act; - u64 resume; - u64 suspend; - spin_lock(&acct_lock); - res = acct->active; - if (!file || time_is_before_jiffies(acct->needcheck)) + if (time_is_before_jiffies(acct->needcheck)) goto out; - spin_unlock(&acct_lock); /* May block */ - if (vfs_statfs(&file->f_path, &sbuf)) - return res; - suspend = sbuf.f_blocks * SUSPEND; - resume = sbuf.f_blocks * RESUME; - - do_div(suspend, 100); - do_div(resume, 100); - - if (sbuf.f_bavail <= suspend) - act = -1; - else if (sbuf.f_bavail >= resume) - act = 1; - else - act = 0; - - /* - * If some joker switched acct->file under us we'ld better be - * silent and _not_ touch anything. 
- */ - spin_lock(&acct_lock); - if (file != acct->file) { - if (act) - res = act > 0; + if (vfs_statfs(&acct->file->f_path, &sbuf)) goto out; - } if (acct->active) { - if (act < 0) { + u64 suspend = sbuf.f_blocks * SUSPEND; + do_div(suspend, 100); + if (sbuf.f_bavail <= suspend) { acct->active = 0; printk(KERN_INFO "Process accounting paused\n"); } } else { - if (act > 0) { + u64 resume = sbuf.f_blocks * RESUME; + do_div(resume, 100); + if (sbuf.f_bavail >= resume) { acct->active = 1; printk(KERN_INFO "Process accounting resumed\n"); } } acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - res = acct->active; out: - spin_unlock(&acct_lock); - return res; + return acct->active; } static void acct_put(struct bsd_acct_struct *p) @@ -550,7 +522,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) * First check to see if there is enough free_space to continue * the process accounting system. */ - if (!check_free_space(acct, file)) + if (!check_free_space(acct)) goto out; fill_ac(&ac); -- cgit v1.1 From 215752fce31c80f3b3a1530bc7cddb3ba6a69b3a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 06:23:41 -0400 Subject: acct: get rid of acct_list Put these suckers on per-vfsmount and per-superblock lists instead. Note: right now it's still acct_lock for everything, but that's going to change. 
Signed-off-by: Al Viro --- kernel/acct.c | 135 ++++++++++++++++++++++++---------------------------------- 1 file changed, 55 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 019f012..21fbb3c 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -59,6 +59,7 @@ #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ #include <linux/pid_namespace.h> +#include <../fs/mount.h> /* will go away when we refactor */ /* * These constants control the amount of freespace that suspend and @@ -79,16 +80,16 @@ static void do_acct_process(struct bsd_acct_struct *acct); struct bsd_acct_struct { long count; + struct hlist_node s_list; + struct hlist_node m_list; struct mutex lock; int active; unsigned long needcheck; struct file *file; struct pid_namespace *ns; - struct list_head list; }; static DEFINE_SPINLOCK(acct_lock); -static LIST_HEAD(acct_list); /* * Check the amount of free space and suspend/resume accordingly. @@ -133,25 +134,33 @@ static void acct_put(struct bsd_acct_struct *p) spin_unlock(&acct_lock); } -static struct bsd_acct_struct *acct_get(struct bsd_acct_struct **p) +static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) +{ + res->count++; + spin_unlock(&acct_lock); + mutex_lock(&res->lock); + if (!res->ns) { + mutex_unlock(&res->lock); + spin_lock(&acct_lock); + if (!--res->count) + kfree(res); + return NULL; + } + return res; +} + +static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; spin_lock(&acct_lock); again: - res = *p; - if (res) - res->count++; - spin_unlock(&acct_lock); - if (res) { - mutex_lock(&res->lock); - if (!res->ns) { - mutex_unlock(&res->lock); - spin_lock(&acct_lock); - if (!--res->count) - kfree(res); - goto again; - } + if (!ns->bacct) { + spin_unlock(&acct_lock); + return NULL; } + res = __acct_get(ns->bacct); + if (!res) + goto again; return res; } @@ -162,7 +171,8 @@ static void acct_kill(struct bsd_acct_struct *acct, struct file *file = acct->file; struct pid_namespace *ns =
acct->ns; spin_lock(&acct_lock); - list_del(&acct->list); + hlist_del(&acct->m_list); + hlist_del(&acct->s_list); mnt_unpin(file->f_path.mnt); spin_unlock(&acct_lock); do_acct_process(acct); @@ -170,8 +180,10 @@ static void acct_kill(struct bsd_acct_struct *acct, spin_lock(&acct_lock); ns->bacct = new; if (new) { - mnt_pin(new->file->f_path.mnt); - list_add(&new->list, &acct_list); + struct vfsmount *m = new->file->f_path.mnt; + mnt_pin(m); + hlist_add_head(&new->s_list, &m->mnt_sb->s_pins); + hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins); } acct->ns = NULL; mutex_unlock(&acct->lock); @@ -218,14 +230,15 @@ static int acct_on(struct filename *pathname) mutex_init(&acct->lock); mnt = file->f_path.mnt; - old = acct_get(&ns->bacct); + old = acct_get(ns); if (old) { acct_kill(old, acct); } else { spin_lock(&acct_lock); ns->bacct = acct; mnt_pin(mnt); - list_add(&acct->list, &acct_list); + hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins); + hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins); spin_unlock(&acct_lock); } mntput(mnt); /* it's pinned, now give up active reference */ @@ -261,79 +274,41 @@ SYSCALL_DEFINE1(acct, const char __user *, name) mutex_unlock(&acct_on_mutex); putname(tmp); } else { - acct_kill(acct_get(&task_active_pid_ns(current)->bacct), NULL); + acct_kill(acct_get(task_active_pid_ns(current)), NULL); } return error; } -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @m: vfsmount being shut down - * - * If the accounting is turned on for a file in the subtree pointed to - * to by m, turn accounting off. Done when m is about to die. 
- */ -void acct_auto_close_mnt(struct vfsmount *m) +void acct_auto_close_mnt(struct hlist_head *list) { - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file->f_path.mnt == m) { - acct->count++; - spin_unlock(&acct_lock); - mutex_lock(&acct->lock); - if (!acct->ns) { - mutex_unlock(&acct->lock); - spin_lock(&acct_lock); - if (!--acct->count) - kfree(acct); - goto restart; - } - acct_kill(acct, NULL); - spin_lock(&acct_lock); - goto restart; - } + while (1) { + spin_lock(&acct_lock); + if (!list->first) + break; + acct_kill(__acct_get(hlist_entry(list->first, + struct bsd_acct_struct, + m_list)), NULL); + } spin_unlock(&acct_lock); } -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @sb: super block for the filesystem - * - * If the accounting is turned on for a file in the filesystem pointed - * to by sb, turn accounting off. - */ -void acct_auto_close(struct super_block *sb) +void acct_auto_close(struct hlist_head *list) { - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file->f_path.dentry->d_sb == sb) { - acct->count++; - spin_unlock(&acct_lock); - mutex_lock(&acct->lock); - if (!acct->ns) { - mutex_unlock(&acct->lock); - spin_lock(&acct_lock); - if (!--acct->count) - kfree(acct); - goto restart; - } - acct_kill(acct, NULL); - spin_lock(&acct_lock); - goto restart; - } + while (1) { + spin_lock(&acct_lock); + if (!list->first) + break; + acct_kill(__acct_get(hlist_entry(list->first, + struct bsd_acct_struct, + s_list)), NULL); + } spin_unlock(&acct_lock); } void acct_exit_ns(struct pid_namespace *ns) { - acct_kill(acct_get(&ns->bacct), NULL); + acct_kill(acct_get(ns), NULL); } /* @@ -602,7 +577,7 @@ void acct_collect(long exitcode, int group_dead) static void slow_acct_process(struct pid_namespace *ns) { for ( ; ns; ns = ns->parent) { - struct bsd_acct_struct *acct = 
acct_get(&ns->bacct); + struct bsd_acct_struct *acct = acct_get(ns); if (acct) { do_acct_process(acct); mutex_unlock(&acct->lock); -- cgit v1.1 From 2798d4ce61601808b965253d60624bbf201b51b0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 07:04:28 -0400 Subject: acct: get rid of acct_lock for acct->count * make acct->count atomic and acct freeing - rcu-delayed. * instead of grabbing acct_lock around the places where we take a reference, do that under rcu_read_lock() with atomic_long_inc_not_zero(). * have the new acct locked before making ns->bacct point to it Signed-off-by: Al Viro --- kernel/acct.c | 85 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 21fbb3c..6fd375f1 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -79,9 +79,14 @@ int acct_parm[3] = {4, 2, 30}; static void do_acct_process(struct bsd_acct_struct *acct); struct bsd_acct_struct { - long count; - struct hlist_node s_list; - struct hlist_node m_list; + atomic_long_t count; + union { + struct { + struct hlist_node s_list; + struct hlist_node m_list; + }; + struct rcu_head rcu; + }; struct mutex lock; int active; unsigned long needcheck; @@ -89,6 +94,11 @@ struct bsd_acct_struct { struct pid_namespace *ns; }; +static void acct_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct bsd_acct_struct, rcu)); +} + static DEFINE_SPINLOCK(acct_lock); /* @@ -128,22 +138,22 @@ out: static void acct_put(struct bsd_acct_struct *p) { - spin_lock(&acct_lock); - if (!--p->count) - kfree(p); - spin_unlock(&acct_lock); + if (atomic_long_dec_and_test(&p->count)) + call_rcu(&p->rcu, acct_free_rcu); } static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) { - res->count++; - spin_unlock(&acct_lock); + if (!atomic_long_inc_not_zero(&res->count)) { + rcu_read_unlock(); + cpu_relax(); + return NULL; + } + rcu_read_unlock(); mutex_lock(&res->lock); if 
(!res->ns) { mutex_unlock(&res->lock); - spin_lock(&acct_lock); - if (!--res->count) - kfree(res); + acct_put(res); return NULL; } return res; @@ -152,13 +162,15 @@ static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; - spin_lock(&acct_lock); again: - if (!ns->bacct) { - spin_unlock(&acct_lock); + smp_rmb(); + rcu_read_lock(); + res = ACCESS_ONCE(ns->bacct); + if (!res) { + rcu_read_unlock(); return NULL; } - res = __acct_get(ns->bacct); + res = __acct_get(res); if (!res) goto again; return res; @@ -170,26 +182,27 @@ static void acct_kill(struct bsd_acct_struct *acct, if (acct) { struct file *file = acct->file; struct pid_namespace *ns = acct->ns; + do_acct_process(acct); + mnt_unpin(file->f_path.mnt); + filp_close(file, NULL); spin_lock(&acct_lock); hlist_del(&acct->m_list); hlist_del(&acct->s_list); - mnt_unpin(file->f_path.mnt); spin_unlock(&acct_lock); - do_acct_process(acct); - filp_close(file, NULL); - spin_lock(&acct_lock); ns->bacct = new; if (new) { struct vfsmount *m = new->file->f_path.mnt; mnt_pin(m); + spin_lock(&acct_lock); hlist_add_head(&new->s_list, &m->mnt_sb->s_pins); hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins); + spin_unlock(&acct_lock); + mutex_unlock(&new->lock); } acct->ns = NULL; + atomic_long_dec(&acct->count); mutex_unlock(&acct->lock); - if (!(acct->count -= 2)) - kfree(acct); - spin_unlock(&acct_lock); + acct_put(acct); } } @@ -223,7 +236,7 @@ static int acct_on(struct filename *pathname) return -EIO; } - acct->count = 1; + atomic_long_set(&acct->count, 1); acct->file = file; acct->needcheck = jiffies; acct->ns = ns; @@ -231,15 +244,17 @@ static int acct_on(struct filename *pathname) mnt = file->f_path.mnt; old = acct_get(ns); + mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ if (old) { acct_kill(old, acct); } else { - spin_lock(&acct_lock); ns->bacct = acct; + spin_lock(&acct_lock); mnt_pin(mnt); 
hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins); hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins); spin_unlock(&acct_lock); + mutex_unlock(&acct->lock); } mntput(mnt); /* it's pinned, now give up active reference */ return 0; @@ -282,28 +297,32 @@ SYSCALL_DEFINE1(acct, const char __user *, name) void acct_auto_close_mnt(struct hlist_head *list) { + rcu_read_lock(); while (1) { - spin_lock(&acct_lock); - if (!list->first) + struct hlist_node *p = ACCESS_ONCE(list->first); + if (!p) break; - acct_kill(__acct_get(hlist_entry(list->first, + acct_kill(__acct_get(hlist_entry(p, struct bsd_acct_struct, m_list)), NULL); + rcu_read_lock(); } - spin_unlock(&acct_lock); + rcu_read_unlock(); } void acct_auto_close(struct hlist_head *list) { + rcu_read_lock(); while (1) { - spin_lock(&acct_lock); - if (!list->first) + struct hlist_node *p = ACCESS_ONCE(list->first); + if (!p) break; - acct_kill(__acct_get(hlist_entry(list->first, + acct_kill(__acct_get(hlist_entry(p, struct bsd_acct_struct, s_list)), NULL); + rcu_read_lock(); } - spin_unlock(&acct_lock); + rcu_read_unlock(); } void acct_exit_ns(struct pid_namespace *ns) -- cgit v1.1 From 17c0a5aaffa63da6b5c73a31e36616bdcd12d143 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 07:35:19 -0400 Subject: make acct_kill() wait for file closing. Do actual closing of file via schedule_work(). And use __fput_sync() there. 
Signed-off-by: Al Viro --- kernel/acct.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 6fd375f1..d9ebc96 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -92,6 +92,8 @@ struct bsd_acct_struct { unsigned long needcheck; struct file *file; struct pid_namespace *ns; + struct work_struct work; + struct completion done; }; static void acct_free_rcu(struct rcu_head *head) @@ -176,15 +178,27 @@ again: return res; } +static void close_work(struct work_struct *work) +{ + struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); + struct file *file = acct->file; + mnt_unpin(file->f_path.mnt); + if (file->f_op->flush) + file->f_op->flush(file, NULL); + __fput_sync(file); + complete(&acct->done); +} + static void acct_kill(struct bsd_acct_struct *acct, struct bsd_acct_struct *new) { if (acct) { - struct file *file = acct->file; struct pid_namespace *ns = acct->ns; do_acct_process(acct); - mnt_unpin(file->f_path.mnt); - filp_close(file, NULL); + INIT_WORK(&acct->work, close_work); + init_completion(&acct->done); + schedule_work(&acct->work); + wait_for_completion(&acct->done); spin_lock(&acct_lock); hlist_del(&acct->m_list); hlist_del(&acct->s_list); -- cgit v1.1 From 215748e67d893169de9e62c3416e9e035e9e9c5f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 07:51:29 -0400 Subject: acct: move mnt_pin() upwards. 
Signed-off-by: Al Viro --- kernel/acct.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index d9ebc96..2d9e04d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -206,7 +206,6 @@ static void acct_kill(struct bsd_acct_struct *acct, ns->bacct = new; if (new) { struct vfsmount *m = new->file->f_path.mnt; - mnt_pin(m); spin_lock(&acct_lock); hlist_add_head(&new->s_list, &m->mnt_sb->s_pins); hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins); @@ -256,6 +255,7 @@ static int acct_on(struct filename *pathname) acct->ns = ns; mutex_init(&acct->lock); mnt = file->f_path.mnt; + mnt_pin(mnt); old = acct_get(ns); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ @@ -264,7 +264,6 @@ static int acct_on(struct filename *pathname) } else { ns->bacct = acct; spin_lock(&acct_lock); - mnt_pin(mnt); hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins); hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins); spin_unlock(&acct_lock); -- cgit v1.1 From 1629d0eb3ead0e0c49e4402049ec7b5b31b81cd7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 08:00:52 -0400 Subject: start carving bsd_acct_struct up pull generic parts into struct fs_pin. Eventually we want those to replace mnt_pin()/mnt_unpin() mess; that stuff will move to fs/*. 
Signed-off-by: Al Viro --- kernel/acct.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 2d9e04d..afeaaa6 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -78,7 +78,7 @@ int acct_parm[3] = {4, 2, 30}; */ static void do_acct_process(struct bsd_acct_struct *acct); -struct bsd_acct_struct { +struct fs_pin { atomic_long_t count; union { struct { @@ -87,6 +87,10 @@ struct bsd_acct_struct { }; struct rcu_head rcu; }; +}; + +struct bsd_acct_struct { + struct fs_pin pin; struct mutex lock; int active; unsigned long needcheck; @@ -96,9 +100,9 @@ struct bsd_acct_struct { struct completion done; }; -static void acct_free_rcu(struct rcu_head *head) +static void pin_free_rcu(struct rcu_head *head) { - kfree(container_of(head, struct bsd_acct_struct, rcu)); + kfree(container_of(head, struct fs_pin, rcu)); } static DEFINE_SPINLOCK(acct_lock); @@ -138,15 +142,15 @@ out: return acct->active; } -static void acct_put(struct bsd_acct_struct *p) +static void pin_put(struct fs_pin *p) { if (atomic_long_dec_and_test(&p->count)) - call_rcu(&p->rcu, acct_free_rcu); + call_rcu(&p->rcu, pin_free_rcu); } static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) { - if (!atomic_long_inc_not_zero(&res->count)) { + if (!atomic_long_inc_not_zero(&res->pin.count)) { rcu_read_unlock(); cpu_relax(); return NULL; @@ -155,7 +159,7 @@ static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) mutex_lock(&res->lock); if (!res->ns) { mutex_unlock(&res->lock); - acct_put(res); + pin_put(&res->pin); return NULL; } return res; @@ -200,22 +204,22 @@ static void acct_kill(struct bsd_acct_struct *acct, schedule_work(&acct->work); wait_for_completion(&acct->done); spin_lock(&acct_lock); - hlist_del(&acct->m_list); - hlist_del(&acct->s_list); + hlist_del(&acct->pin.m_list); + hlist_del(&acct->pin.s_list); spin_unlock(&acct_lock); ns->bacct = new; if (new) { struct 
vfsmount *m = new->file->f_path.mnt; spin_lock(&acct_lock); - hlist_add_head(&new->s_list, &m->mnt_sb->s_pins); - hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins); + hlist_add_head(&new->pin.s_list, &m->mnt_sb->s_pins); + hlist_add_head(&new->pin.m_list, &real_mount(m)->mnt_pins); spin_unlock(&acct_lock); mutex_unlock(&new->lock); } acct->ns = NULL; - atomic_long_dec(&acct->count); + atomic_long_dec(&acct->pin.count); mutex_unlock(&acct->lock); - acct_put(acct); + pin_put(&acct->pin); } } @@ -249,7 +253,7 @@ static int acct_on(struct filename *pathname) return -EIO; } - atomic_long_set(&acct->count, 1); + atomic_long_set(&acct->pin.count, 1); acct->file = file; acct->needcheck = jiffies; acct->ns = ns; @@ -264,8 +268,8 @@ static int acct_on(struct filename *pathname) } else { ns->bacct = acct; spin_lock(&acct_lock); - hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins); - hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins); + hlist_add_head(&acct->pin.s_list, &mnt->mnt_sb->s_pins); + hlist_add_head(&acct->pin.m_list, &real_mount(mnt)->mnt_pins); spin_unlock(&acct_lock); mutex_unlock(&acct->lock); } @@ -317,7 +321,7 @@ void acct_auto_close_mnt(struct hlist_head *list) break; acct_kill(__acct_get(hlist_entry(p, struct bsd_acct_struct, - m_list)), NULL); + pin.m_list)), NULL); rcu_read_lock(); } rcu_read_unlock(); @@ -332,7 +336,7 @@ void acct_auto_close(struct hlist_head *list) break; acct_kill(__acct_get(hlist_entry(p, struct bsd_acct_struct, - s_list)), NULL); + pin.s_list)), NULL); rcu_read_lock(); } rcu_read_unlock(); @@ -613,7 +617,7 @@ static void slow_acct_process(struct pid_namespace *ns) if (acct) { do_acct_process(acct); mutex_unlock(&acct->lock); - acct_put(acct); + pin_put(&acct->pin); } } } -- cgit v1.1 From efb170c22867cdc6f770de441bdefecec6712199 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 08:39:04 -0400 Subject: take fs_pin stuff to fs/* Add a new field to fs_pin - kill(pin). 
That's what umount and r/o remount will be calling for all pins attached to vfsmount and superblock resp. Called after bumping the refcount, so it won't go away under us. Dropping the refcount is responsibility of the instance. All generic stuff moved to fs/fs_pin.c; the next step will rip all the knowledge of kernel/acct.c from fs/super.c and fs/namespace.c. After that - death to mnt_pin(); it was intended to be usable as generic mechanism for code that wants to attach objects to vfsmount, so that they would not make the sucker busy and would get killed on umount. Never got it right; it remained acct.c-specific all along. Now it's very close to being killable. Signed-off-by: Al Viro --- kernel/acct.c | 127 +++++++++++++++------------------------------------------- 1 file changed, 32 insertions(+), 95 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index afeaaa6..a7993a6 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -59,7 +59,7 @@ #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ #include <linux/pid_namespace.h> -#include <../fs/mount.h> /* will go away when we refactor */ +#include <linux/fs_pin.h> /* * These constants control the amount of freespace that suspend and @@ -78,17 +78,6 @@ int acct_parm[3] = {4, 2, 30}; */ static void do_acct_process(struct bsd_acct_struct *acct); -struct fs_pin { - atomic_long_t count; - union { - struct { - struct hlist_node s_list; - struct hlist_node m_list; - }; - struct rcu_head rcu; - }; -}; - struct bsd_acct_struct { struct fs_pin pin; struct mutex lock; @@ -100,13 +89,6 @@ struct bsd_acct_struct { struct completion done; }; -static void pin_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct fs_pin, rcu)); -} - -static DEFINE_SPINLOCK(acct_lock); - /* * Check the amount of free space and suspend/resume accordingly.
*/ @@ -142,29 +124,6 @@ out: return acct->active; } -static void pin_put(struct fs_pin *p) -{ - if (atomic_long_dec_and_test(&p->count)) - call_rcu(&p->rcu, pin_free_rcu); -} - -static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) -{ - if (!atomic_long_inc_not_zero(&res->pin.count)) { - rcu_read_unlock(); - cpu_relax(); - return NULL; - } - rcu_read_unlock(); - mutex_lock(&res->lock); - if (!res->ns) { - mutex_unlock(&res->lock); - pin_put(&res->pin); - return NULL; - } - return res; -} - static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; @@ -176,9 +135,18 @@ again: rcu_read_unlock(); return NULL; } - res = __acct_get(res); - if (!res) + if (!atomic_long_inc_not_zero(&res->pin.count)) { + rcu_read_unlock(); + cpu_relax(); goto again; + } + rcu_read_unlock(); + mutex_lock(&res->lock); + if (!res->ns) { + mutex_unlock(&res->lock); + pin_put(&res->pin); + goto again; + } return res; } @@ -203,19 +171,8 @@ static void acct_kill(struct bsd_acct_struct *acct, init_completion(&acct->done); schedule_work(&acct->work); wait_for_completion(&acct->done); - spin_lock(&acct_lock); - hlist_del(&acct->pin.m_list); - hlist_del(&acct->pin.s_list); - spin_unlock(&acct_lock); + pin_remove(&acct->pin); ns->bacct = new; - if (new) { - struct vfsmount *m = new->file->f_path.mnt; - spin_lock(&acct_lock); - hlist_add_head(&new->pin.s_list, &m->mnt_sb->s_pins); - hlist_add_head(&new->pin.m_list, &real_mount(m)->mnt_pins); - spin_unlock(&acct_lock); - mutex_unlock(&new->lock); - } acct->ns = NULL; atomic_long_dec(&acct->pin.count); mutex_unlock(&acct->lock); @@ -223,6 +180,19 @@ static void acct_kill(struct bsd_acct_struct *acct, } } +static void acct_pin_kill(struct fs_pin *pin) +{ + struct bsd_acct_struct *acct; + acct = container_of(pin, struct bsd_acct_struct, pin); + mutex_lock(&acct->lock); + if (!acct->ns) { + mutex_unlock(&acct->lock); + pin_put(pin); + acct = NULL; + } + acct_kill(acct, NULL); +} + static int 
acct_on(struct filename *pathname) { struct file *file; @@ -254,25 +224,22 @@ static int acct_on(struct filename *pathname) } atomic_long_set(&acct->pin.count, 1); + acct->pin.kill = acct_pin_kill; acct->file = file; acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); mnt = file->f_path.mnt; mnt_pin(mnt); + mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ + pin_insert(&acct->pin, mnt); old = acct_get(ns); - mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ - if (old) { + if (old) acct_kill(old, acct); - } else { + else ns->bacct = acct; - spin_lock(&acct_lock); - hlist_add_head(&acct->pin.s_list, &mnt->mnt_sb->s_pins); - hlist_add_head(&acct->pin.m_list, &real_mount(mnt)->mnt_pins); - spin_unlock(&acct_lock); - mutex_unlock(&acct->lock); - } + mutex_unlock(&acct->lock); mntput(mnt); /* it's pinned, now give up active reference */ return 0; } @@ -312,36 +279,6 @@ SYSCALL_DEFINE1(acct, const char __user *, name) return error; } -void acct_auto_close_mnt(struct hlist_head *list) -{ - rcu_read_lock(); - while (1) { - struct hlist_node *p = ACCESS_ONCE(list->first); - if (!p) - break; - acct_kill(__acct_get(hlist_entry(p, - struct bsd_acct_struct, - pin.m_list)), NULL); - rcu_read_lock(); - } - rcu_read_unlock(); -} - -void acct_auto_close(struct hlist_head *list) -{ - rcu_read_lock(); - while (1) { - struct hlist_node *p = ACCESS_ONCE(list->first); - if (!p) - break; - acct_kill(__acct_get(hlist_entry(p, - struct bsd_acct_struct, - pin.s_list)), NULL); - rcu_read_lock(); - } - rcu_read_unlock(); -} - void acct_exit_ns(struct pid_namespace *ns) { acct_kill(acct_get(ns), NULL); -- cgit v1.1 From 3064c3563ba4c23e2c7a47254ec056ed9ba0098a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 09:12:31 -0400 Subject: death to mnt_pinned Rather than playing silly buggers with vfsmount refcounts, just have acct_on() ask fs/namespace.c for internal clone of file->f_path.mnt and replace it with said clone. 
Then attach the pin to original vfsmount. Voila - the clone will be alive until the file gets closed, making sure that underlying superblock remains active, etc., and we can drop the original vfsmount, so that it's not kept busy. If the file lives until the final mntput of the original vfsmount, we'll notice that there's an fs_pin (one in bsd_acct_struct that holds that file) and mnt_pin_kill() will take it out. Since ->kill() is synchronous, we won't proceed past that point until these files are closed (and private clones of our vfsmount are gone), so we get the same ordering warranties we used to get. mnt_pin()/mnt_unpin()/->mnt_pinned is gone now, and good riddance - it never became usable outside of kernel/acct.c (and racy wrt umount even there). Signed-off-by: Al Viro --- kernel/acct.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index a7993a6..2e6cf818 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -154,7 +154,6 @@ static void close_work(struct work_struct *work) { struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); struct file *file = acct->file; - mnt_unpin(file->f_path.mnt); if (file->f_op->flush) file->f_op->flush(file, NULL); __fput_sync(file); @@ -196,9 +195,10 @@ static void acct_pin_kill(struct fs_pin *pin) static int acct_on(struct filename *pathname) { struct file *file; - struct vfsmount *mnt; + struct vfsmount *mnt, *internal; struct pid_namespace *ns = task_active_pid_ns(current); struct bsd_acct_struct *acct, *old; + int err; acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); if (!acct) @@ -222,6 +222,21 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return -EIO; } + internal = mnt_clone_internal(&file->f_path); + if (IS_ERR(internal)) { + kfree(acct); + filp_close(file, NULL); + return PTR_ERR(internal); + } + err = mnt_want_write(internal); + if (err) { + mntput(internal); + 
kfree(acct); + filp_close(file, NULL); + return err; + } + mnt = file->f_path.mnt; + file->f_path.mnt = internal; atomic_long_set(&acct->pin.count, 1); acct->pin.kill = acct_pin_kill; @@ -229,8 +244,6 @@ static int acct_on(struct filename *pathname) acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); - mnt = file->f_path.mnt; - mnt_pin(mnt); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ pin_insert(&acct->pin, mnt); @@ -240,7 +253,8 @@ static int acct_on(struct filename *pathname) else ns->bacct = acct; mutex_unlock(&acct->lock); - mntput(mnt); /* it's pinned, now give up active reference */ + mnt_drop_write(mnt); + mntput(mnt); return 0; } -- cgit v1.1 From 2577d92ebd28dd9b3dacdfad6dcd81be0d21bbdf Mon Sep 17 00:00:00 2001 From: Ionut Alexa Date: Thu, 31 Jul 2014 09:28:36 +1000 Subject: kernel/acct.c: fix coding style warnings and errors Signed-off-by: Ionut Alexa Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- kernel/acct.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 2e6cf818..b4c667d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -108,14 +108,14 @@ static int check_free_space(struct bsd_acct_struct *acct) do_div(suspend, 100); if (sbuf.f_bavail <= suspend) { acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); + pr_info("Process accounting paused\n"); } } else { u64 resume = sbuf.f_blocks * RESUME; do_div(resume, 100); if (sbuf.f_bavail >= resume) { acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); + pr_info("Process accounting resumed\n"); } } @@ -280,6 +280,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); + if (IS_ERR(tmp)) return PTR_ERR(tmp); mutex_lock(&acct_on_mutex); @@ -337,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value) return exp; } -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || 
ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * @@ -350,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value) #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ -#define MAXEXP2 ((1 < 0){ + if (value == 0) + return 0; + while ((s64)value > 0) { value <<= 1; exp--; } @@ -429,16 +431,17 @@ static void fill_ac(acct_t *ac) run_time -= current->group_leader->start_time; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac->ac_etime = encode_float(elapsed); #else ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); + (unsigned long) elapsed : (unsigned long) -1l); #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); + ac->ac_etime_hi = etime >> 16; ac->ac_etime_lo = (u16) etime; } @@ -491,12 +494,12 @@ static void do_acct_process(struct bsd_acct_struct *acct) /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); @@ -530,6 +533,7 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; + down_read(&current->mm->mmap_sem); vma = current->mm->mmap; while (vma) { -- cgit v1.1 From 00501b531c4723972aa11d6d4ebcf8d6552007c8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 8
Aug 2014 14:19:20 -0700 Subject: mm: memcontrol: rewrite charge API These patches rework memcg charge lifetime to integrate more naturally with the lifetime of user pages. This drastically simplifies the code and reduces charging and uncharging overhead. The most expensive part of charging and uncharging is the page_cgroup bit spinlock, which is removed entirely after this series. Here are the top-10 profile entries of a stress test that reads a 128G sparse file on a freshly booted box, without even a dedicated cgroup (i.e. executing in the root memcg). Before: 15.36% cat [kernel.kallsyms] [k] copy_user_generic_string 13.31% cat [kernel.kallsyms] [k] memset 11.48% cat [kernel.kallsyms] [k] do_mpage_readpage 4.23% cat [kernel.kallsyms] [k] get_page_from_freelist 2.38% cat [kernel.kallsyms] [k] put_page 2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge 2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common 1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn After: 15.67% cat [kernel.kallsyms] [k] copy_user_generic_string 13.48% cat [kernel.kallsyms] [k] memset 11.42% cat [kernel.kallsyms] [k] do_mpage_readpage 3.98% cat [kernel.kallsyms] [k] get_page_from_freelist 2.46% cat [kernel.kallsyms] [k] put_page 2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn 1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk 1.30% cat [kernel.kallsyms] [k] kfree As you can see, the memcg footprint has shrunk quite a bit. text data bss dec hex filename 37970 9892 400 48262 bc86 mm/memcontrol.o.old 35239 9892 400 45531 b1db mm/memcontrol.o This patch (of 4): The memcg charge API charges pages before they are rmapped - i.e. have an actual "type" - and so every callsite needs its own set of charge and uncharge functions to know what type is being operated on. 
Worse, uncharge has to happen from a context that is still type-specific, rather than at the end of the page's lifetime with exclusive access, and so requires a lot of synchronization. Rewrite the charge API to provide a generic set of try_charge(), commit_charge() and cancel_charge() transaction operations, much like what's currently done for swap-in: mem_cgroup_try_charge() attempts to reserve a charge, reclaiming pages from the memcg if necessary. mem_cgroup_commit_charge() commits the page to the charge once it has a valid page->mapping and PageAnon() reliably tells the type. mem_cgroup_cancel_charge() aborts the transaction. This reduces the charge API and enables subsequent patches to drastically simplify uncharging. As pages need to be committed after rmap is established but before they are added to the LRU, page_add_new_anon_rmap() must stop doing LRU additions again. Revive lru_cache_add_active_or_unevictable(). [hughd@google.com: fix shmem_unuse] [hughd@google.com: Add comments on the private use of -EAGAIN] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Hugh Dickins Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6f3254e..1d0af8a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, /* For mmu_notifiers */ const unsigned long mmun_start = addr; const unsigned long mmun_end = addr + PAGE_SIZE; + struct mem_cgroup *memcg; + + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + if (err) + return err; /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); @@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long 
addr, get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); + mem_cgroup_commit_charge(kpage, memcg, false); + lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { dec_mm_counter(mm, MM_FILEPAGES); @@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: + mem_cgroup_cancel_charge(kpage, memcg); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; @@ -315,18 +323,11 @@ retry: if (!new_page) goto put_old; - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) - goto put_new; - __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = __replace_page(vma, vaddr, old_page, new_page); - if (ret) - mem_cgroup_uncharge_page(new_page); - -put_new: page_cache_release(new_page); put_old: put_page(old_page); -- cgit v1.1 From 747db954cab64c6b7a95b121b517165f34751898 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 8 Aug 2014 14:19:24 -0700 Subject: mm: memcontrol: use page lists for uncharge batching Pages are now uncharged at release time, and all sources of batched uncharges operate on lists of pages. Directly use those lists, and get rid of the per-task batching state. This also batches statistics accounting, in addition to the res counter charges, to reduce IRQ-disabling and re-enabling. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Cc: Naoya Horiguchi Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index fbd3497..f6f5086 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1346,10 +1346,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_MEMCG - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; -#endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; -- cgit v1.1 From 9a3f4d85d58cb4e02e226f9be946d54c33eb715b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:19:28 -0700 Subject: page-cgroup: get rid of NR_PCG_FLAGS It's not used anywhere today, so let's remove it. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/bounds.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bounds.c b/kernel/bounds.c index 9fd4246..e1d1d195 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -18,7 +17,6 @@ void foo(void) /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); - DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif -- cgit v1.1 From b86280aa48b67c8119ed8f6c6bebd8c0af13a269 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 8 Aug 2014 14:19:41 -0700 Subject: kernel/kallsyms.c: fix %pB when there's no symbol at the address __sprint_symbol() should restore original address when kallsyms_lookup() failed to find a symbol. 
It's reported when dumpstack shows an address in a dynamically allocated trampoline for ftrace. [ 1314.612287] [] dump_stack+0x45/0x56 [ 1314.612290] [] ? meminfo_proc_open+0x30/0x30 [ 1314.612293] [] kpatch_ftrace_handler+0x14/0xf0 [kpatch] [ 1314.612306] [<ffffffffa00160c4>] 0xffffffffa00160c3 You can see a difference in the hex address - c4 and c3. Fix it. Signed-off-by: Namhyung Kim Reported-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index cb0cf37..ae51670 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address, address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) - return sprintf(buffer, "0x%lx", address); + return sprintf(buffer, "0x%lx", address - symbol_offset); if (name != buffer) strcpy(buffer, name); -- cgit v1.1 From 4878b14b43188ffeceecfc32295ed2a783b7aa7a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 8 Aug 2014 14:19:48 -0700 Subject: kernel/test_kprobes.c: use current logging functions - Add pr_fmt - Coalesce formats - Use current pr_foo() functions instead of printk - Remove unnecessary "failed" display (already in log level). Signed-off-by: Fabian Frederick Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/test_kprobes.c | 87 ++++++++++++++++++--------------------------------- 1 file changed, 31 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 12d6ebbf..0dbab6d 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -14,6 +14,8 @@ * the GNU General Public License for more details.
*/ +#define pr_fmt(fmt) "Kprobe smoke test: " fmt + #include #include #include @@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler\n"); + pr_err("incorrect value in post_handler\n"); } posth_val = preh_val + div_factor; } @@ -59,8 +60,7 @@ static int test_kprobe(void) ret = register_kprobe(&kp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobe returned %d\n", ret); + pr_err("register_kprobe returned %d\n", ret); return ret; } @@ -68,14 +68,12 @@ static int test_kprobe(void) unregister_kprobe(&kp); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler2\n"); + pr_err("incorrect value in post_handler2\n"); } posth_val = preh_val + div_factor; } @@ -120,8 +117,7 @@ static int test_kprobes(void) kp.flags = 0; ret = register_kprobes(kps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobes returned %d\n", ret); + pr_err("register_kprobes returned %d\n", ret); return ret; } @@ -130,14 +126,12 @@ static int test_kprobes(void) ret = target(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + 
pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -146,14 +140,12 @@ static int test_kprobes(void) ret = target2(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler2 not called\n"); + pr_err("kprobe pre_handler2 not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler2 not called\n"); + pr_err("kprobe post_handler2 not called\n"); handler_errors++; } @@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value) { if (value != rand1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in jprobe handler\n"); + pr_err("incorrect value in jprobe handler\n"); } jph_val = rand1; @@ -186,16 +177,14 @@ static int test_jprobe(void) ret = register_jprobe(&jp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobe returned %d\n", ret); + pr_err("register_jprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_jprobe(&jp); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } @@ -217,24 +206,21 @@ static int test_jprobes(void) jp.kp.flags = 0; ret = register_jprobes(jps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobes returned %d\n", ret); + pr_err("register_jprobes returned %d\n", ret); return ret; } jph_val = 0; ret = target(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } jph_val = 0; ret = target2(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler2 not called\n"); + pr_err("jprobe handler2 not called\n"); handler_errors++; } unregister_jprobes(jps, 2); @@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != 
(rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler\n"); + pr_err("incorrect value in kretprobe handler\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -281,16 +265,14 @@ static int test_kretprobe(void) ret = register_kretprobe(&rp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_kretprobe(&rp); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } @@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler2\n"); + pr_err("incorrect value in kretprobe handler2\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -332,24 +312,21 @@ static int test_kretprobes(void) rp.kp.flags = 0; ret = register_kretprobes(rps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } krph_val = 0; ret = target(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } krph_val = 0; ret = target2(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe 
handler2 not called\n"); + pr_err("kretprobe handler2 not called\n"); handler_errors++; } unregister_kretprobes(rps, 2); @@ -368,7 +345,7 @@ int init_test_probes(void) rand1 = prandom_u32(); } while (rand1 <= div_factor); - printk(KERN_INFO "Kprobe smoke test started\n"); + pr_info("started\n"); num_tests++; ret = test_kprobe(); if (ret < 0) @@ -402,13 +379,11 @@ int init_test_probes(void) #endif /* CONFIG_KRETPROBES */ if (errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " - "%d tests failed\n", errors, num_tests); + pr_err("BUG: %d out of %d tests failed\n", errors, num_tests); else if (handler_errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " - "running handlers\n", handler_errors); + pr_err("BUG: %d error(s) running handlers\n", handler_errors); else - printk(KERN_INFO "Kprobe smoke test passed successfully\n"); + pr_info("passed successfully\n"); return 0; } -- cgit v1.1 From a0be55dee71d437f7593c8c3673edd92962bafaf Mon Sep 17 00:00:00 2001 From: Ionut Alexa Date: Fri, 8 Aug 2014 14:21:18 -0700 Subject: kernel/exit.c: fix coding style warnings and errors Fixed coding style warnings and errors. 
Signed-off-by: Ionut Alexa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 88c6b3e..32c58f7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,7 +59,7 @@ #include #include -static void exit_mm(struct task_struct * tsk); +static void exit_mm(struct task_struct *tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { @@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk) spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); @@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) } -void release_task(struct task_struct * p) +void release_task(struct task_struct *p) { struct task_struct *leader; int zap_leader; @@ -192,7 +192,8 @@ repeat: */ zap_leader = 0; leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + if (leader != p && thread_group_empty(leader) + && leader->exit_state == EXIT_ZOMBIE) { /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp) * * "I ask you, have you ever known what it is to be an orphan?" 
*/ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) +static int will_become_orphaned_pgrp(struct pid *pgrp, + struct task_struct *ignored_task) { struct task_struct *p; @@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) struct task_struct *ignored_task = tsk; if (!parent) - /* exit: our father is in a different pgrp than - * we are and we were the only connection outside. - */ + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than @@ -405,7 +407,7 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct * tsk) +static void exit_mm(struct task_struct *tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; @@ -425,6 +427,7 @@ static void exit_mm(struct task_struct * tsk) core_state = mm->core_state; if (core_state) { struct core_thread self; + up_read(&mm->mmap_sem); self.task = tsk; @@ -566,6 +569,7 @@ static void forget_original_parent(struct task_struct *father) list_for_each_entry_safe(p, n, &father->children, sibling) { struct task_struct *t = p; + do { t->real_parent = reaper; if (t->parent == father) { @@ -599,7 +603,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* * This does two things: * - * A. Make init inherit all the child processes + * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. 
(POSIX 3.2.2.2) @@ -649,9 +653,8 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s (%d) used greatest stack depth: " - "%lu bytes left\n", - current->comm, task_pid_nr(current), free); + pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); @@ -692,8 +695,7 @@ void do_exit(long code) * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { - printk(KERN_ALERT - "Fixing recursive fault but reboot is needed!\n"); + pr_alert("Fixing recursive fault but reboot is needed!\n"); /* * We can do this unlocked here. The futex code uses * this flag just to verify whether the pi state @@ -717,9 +719,9 @@ void do_exit(long code) raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) - printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ @@ -837,7 +839,6 @@ void do_exit(long code) for (;;) cpu_relax(); /* For when BUG is null */ } - EXPORT_SYMBOL_GPL(do_exit); void complete_and_exit(struct completion *comp, long code) @@ -847,7 +848,6 @@ void complete_and_exit(struct completion *comp, long code) do_exit(code); } - EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) @@ -870,6 +870,7 @@ do_group_exit(int exit_code) exit_code = sig->group_exit_code; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; + spin_lock_irq(&sighand->siglock); if (signal_group_exit(sig)) /* Another thread got here before we took the lock. 
*/ @@ -1034,9 +1035,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * as other threads in the parent group can be right * here reaping other children at the same time. * - * We use thread_group_cputime_adjusted() to get times for the thread - * group, which consolidates times for all threads in the - * group including the group leader. + * We use thread_group_cputime_adjusted() to get times for + * the thread group, which consolidates times for all threads + * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); @@ -1418,6 +1419,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); + if (ret) return ret; } @@ -1431,6 +1433,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); + if (ret) return ret; } -- cgit v1.1 From ccf94f1b4a8560ffdc221840535bae5e5a91a53c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 8 Aug 2014 14:21:22 -0700 Subject: proc: constify seq_operations proc_uid_seq_operations, proc_gid_seq_operations and proc_projid_seq_operations are only called in proc_id_map_open with seq_open as const struct seq_operations so we can constify the 3 structures and update proc_id_map_open prototype. 
text data bss dec hex filename 6817 404 1984 9205 23f5 kernel/user_namespace.o-before 6913 308 1984 9205 23f5 kernel/user_namespace.o-after Signed-off-by: Fabian Frederick Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user_namespace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index fcc0256..aa312b0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v) return; } -struct seq_operations proc_uid_seq_operations = { +const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; -struct seq_operations proc_gid_seq_operations = { +const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; -struct seq_operations proc_projid_seq_operations = { +const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, -- cgit v1.1 From 41f727fde1fe40efeb4fef6fdce74ff794be5aeb Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:21:56 -0700 Subject: fork/exec: cleanup mm initialization mm initialization on fork/exec is spread all over the place, which makes the code look inconsistent. 
We have mm_init(), which is supposed to init/nullify mm's internals, but it doesn't init all the fields it should: - on fork ->mmap,mm_rb,vmacache_seqnum,map_count,mm_cpumask,locked_vm are zeroed in dup_mmap(); - on fork ->pmd_huge_pte is zeroed in dup_mm(), immediately before calling mm_init(); - ->cpu_vm_mask_var ptr is initialized by mm_init_cpumask(), which is called before mm_init() on both fork and exec; - ->context is initialized by init_new_context(), which is called after mm_init() on both fork and exec; Let's consolidate all the initializations in mm_init() to make the code look cleaner. Signed-off-by: Vladimir Davydov Cc: Oleg Nesterov Cc: David Rientjes Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f6f5086..418b52a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -374,12 +374,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - mm->locked_vm = 0; - mm->mmap = NULL; - mm->vmacache_seqnum = 0; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; @@ -538,17 +532,27 @@ static void mm_init_aio(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm->map_count = 0; + mm->locked_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mmu_notifier_mm_init(mm); 
clear_tlb_flush_pending(mm); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + mm->pmd_huge_pte = NULL; +#endif if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@ -558,11 +562,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->def_flags = 0; } - if (likely(!mm_alloc_pgd(mm))) { - mmu_notifier_mm_init(mm); - return mm; - } + if (mm_alloc_pgd(mm)) + goto fail_nopgd; + + if (init_new_context(p, mm)) + goto fail_nocontext; + return mm; + +fail_nocontext: + mm_free_pgd(mm); +fail_nopgd: free_mm(mm); return NULL; } @@ -596,7 +606,6 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); return mm_init(mm, current); } @@ -828,17 +837,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - mm->pmd_huge_pte = NULL; -#endif if (!mm_init(mm, tsk)) goto fail_nomem; - if (init_new_context(tsk, mm)) - goto fail_nocontext; - dup_mm_exe_file(oldmm, mm); err = dup_mmap(mm, oldmm); @@ -860,15 +862,6 @@ free_pt: fail_nomem: return NULL; - -fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; } static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) -- cgit v1.1 From ce65cefa5debefc0e81d0a533bda467f0aa67350 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:21:58 -0700 Subject: fork: reset mm->pinned_vm mm->pinned_vm counts pages of mm's address space that were permanently pinned in memory by increasing their reference counter. The counter was introduced by commit bc3e53f682d9 ("mm: distinguish between mlocked and pinned pages"), while before it locked_vm had been used for such pages. 
Obviously, we should reset the counter on fork if !CLONE_VM, just like we do with locked_vm, but currently we don't. Let's fix it. This patch will fix the contents of /proc/pid/status:VmPin. ib_umem_get[infiniband] and perf_mmap still check pinned_vm against RLIMIT_MEMLOCK. It's left from the times when pinned pages were accounted under locked_vm, but today it looks wrong. It isn't clear how we should deal with it. We still have some drivers accounting pinned pages under mm->locked_vm - this is what commit bc3e53f682d9 was fighting against. It's infiniband/usnic and vfio. Signed-off-by: Vladimir Davydov Cc: Oleg Nesterov Cc: David Rientjes Cc: Christoph Lameter Cc: Roland Dreier Cc: Sean Hefty Cc: Hal Rosenstock Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 418b52a..5a547a5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -543,6 +543,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) atomic_long_set(&mm->nr_ptes, 0); mm->map_count = 0; mm->locked_vm = 0; + mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm_init_cpumask(mm); -- cgit v1.1 From 4f7d461433bb4a4deee61baefdac6cd1a1ecb546 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:22:01 -0700 Subject: fork: copy mm's vm usage counters under mmap_sem If a forking process has a thread calling (un)mmap (silly but still), the child process may have some of its mm's vm usage counters (total_vm and friends) screwed up, because currently they are copied from oldmm w/o holding any locks (memcpy in dup_mm). This patch moves the counters initialization to dup_mmap() to be called under oldmm->mmap_sem, which eliminates any possibility of race. 
Signed-off-by: Vladimir Davydov Cc: Oleg Nesterov Cc: David Rientjes Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5a547a5..aff84f8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -374,6 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + mm->total_vm = oldmm->total_vm; + mm->shared_vm = oldmm->shared_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; -- cgit v1.1 From 33144e8429bd7fceacbb869a7f5061db42e13fe6 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:22:03 -0700 Subject: kernel/fork.c: make mm_init_owner static It's only used in fork.c:mm_init(). Signed-off-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index aff84f8..86da59e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -535,6 +535,13 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + mm->owner = p; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { mm->mmap = NULL; @@ -1139,13 +1146,6 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_MEMCG -void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ - mm->owner = p; -} -#endif /* CONFIG_MEMCG */ - /* * Initialize POSIX timer handling for a single task. 
*/ -- cgit v1.1 From 834b18b23e1012e6c2987af703490bc60956d211 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 8 Aug 2014 14:22:20 -0700 Subject: kernel/gcov/fs.c: remove unnecessary null test before debugfs_remove This fixes checkpatch warning: WARNING: debugfs_remove(NULL) is safe this check is probably not required Signed-off-by: Fabian Frederick Cc: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/fs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 15ff01a..edf67c4 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -784,8 +784,7 @@ static __init int gcov_fs_init(void) err_remove: pr_err("init failed\n"); - if (root_node.dentry) - debugfs_remove(root_node.dentry); + debugfs_remove(root_node.dentry); return rc; } -- cgit v1.1 From 69361eef9056b0babb507798c2135ad1572f0ef7 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Fri, 8 Aug 2014 14:22:31 -0700 Subject: panic: add TAINT_SOFTLOCKUP This taint flag will be set if the system has ever entered a softlockup state. Similar to TAINT_WARN it is useful to know whether or not the system has been in a softlockup state when debugging. 
[akpm@linux-foundation.org: apply the taint before calling panic()] Signed-off-by: Josh Hunt Cc: Jason Baron Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 1 + kernel/watchdog.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 62e16ce..d09dc5c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -224,6 +224,7 @@ static const struct tnt tnts[] = { { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, { TAINT_UNSIGNED_MODULE, 'E', ' ' }, + { TAINT_SOFTLOCKUP, 'L', ' ' }, }; /** diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 51b29e9..a8d6914 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -368,6 +368,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) smp_mb__after_atomic(); } + add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); -- cgit v1.1 From ab602f799159393143d567e5c04b936fec79d6bd Mon Sep 17 00:00:00 2001 From: Jack Miller Date: Fri, 8 Aug 2014 14:23:19 -0700 Subject: shm: make exit_shm work proportional to task activity This is small set of patches our team has had kicking around for a few versions internally that fixes tasks getting hung on shm_exit when there are many threads hammering it at once. Anton wrote a simple test to cause the issue: http://ozlabs.org/~anton/junkcode/bust_shm_exit.c Before applying this patchset, this test code will cause either hanging tracebacks or pthread out of memory errors. After this patchset, it will still produce output like: root@somehost:~# ./bust_shm_exit 1024 160 ... INFO: rcu_sched detected stalls on CPUs/tasks: {} (detected by 116, t=2111 jiffies, g=241, c=240, q=7113) INFO: Stall ended before state dump start ... But the task will continue to run along happily, so we consider this an improvement over hanging, even if it's a bit noisy. 
This patch (of 3): exit_shm obtains the ipc_ns shm rwsem for write and holds it while it walks every shared memory segment in the namespace. Thus the amount of work is related to the number of shm segments in the namespace, not the number of segments that might need to be cleaned. In addition, this occurs after the task has been notified the thread has exited, so the number of tasks waiting for the ns shm rwsem can grow without bound until memory is exhausted. Add a list to the task struct of all shmids allocated by this task. Init the list head in copy_process. Use the ns->rwsem for locking. Add segments after id is added, remove before removing from id. On unshare of NEW_IPCNS orphan any ids as if the task had exited, similar to handling of semaphore undo. I chose a define for the init sequence since it's a simple list init, otherwise it would require a function call to avoid include loops between the semaphore code and the task struct. Converting the list_del to list_del_init for the unshare cases would remove the exit followed by init, but I left it blow up if not inited. Signed-off-by: Milton Miller Signed-off-by: Jack Miller Cc: Davidlohr Bueso Cc: Manfred Spraul Cc: Anton Blanchard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 86da59e..fa91243 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1362,6 +1362,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@ -1913,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above).
*/ + exit_shm(current); + shm_init_task(current); + } if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); -- cgit v1.1 From 934fc295b30ea8ce5d5e0ab9024a10fab9b6f200 Mon Sep 17 00:00:00 2001 From: Ionut Alexa Date: Fri, 8 Aug 2014 14:23:42 -0700 Subject: kernel/acct.c: fix coding style warnings and errors Signed-off-by: Ionut Alexa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index a1844f1..5179352 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -141,12 +141,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) if (acct->active) { if (act < 0) { acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); + pr_info("Process accounting paused\n"); } } else { if (act > 0) { acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); + pr_info("Process accounting resumed\n"); } } @@ -261,6 +261,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); + if (IS_ERR(tmp)) return PTR_ERR(tmp); error = acct_on(tmp); @@ -376,7 +377,7 @@ static comp_t encode_comp_t(unsigned long value) return exp; } -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * @@ -389,7 +390,7 @@ static comp_t encode_comp_t(unsigned long value) #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. 
*/ -#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ +#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */ static comp2_t encode_comp2_t(u64 value) { @@ -420,7 +421,7 @@ static comp2_t encode_comp2_t(u64 value) } #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 /* * encode an u64 into a 32 bit IEEE float */ @@ -429,8 +430,9 @@ static u32 encode_float(u64 value) unsigned exp = 190; unsigned u; - if (value==0) return 0; - while ((s64)value > 0){ + if (value == 0) + return 0; + while ((s64)value > 0) { value <<= 1; exp--; } @@ -486,16 +488,17 @@ static void do_acct_process(struct bsd_acct_struct *acct, run_time -= current->group_leader->start_time; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_etime = encode_float(elapsed); #else ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); + (unsigned long) elapsed : (unsigned long) -1l); #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); + ac.ac_etime_hi = etime >> 16; ac.ac_etime_lo = (u16) etime; } @@ -505,15 +508,15 @@ static void do_acct_process(struct bsd_acct_struct *acct, /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==2 +#if ACCT_VERSION == 2 ac.ac_ahz = AHZ; #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); @@ -574,6 +577,7 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; + down_read(&current->mm->mmap_sem); vma = current->mm->mmap; while (vma) { -- cgit v1.1 From 4bb5f5d9395bc112d93a134d8f5b05611eddc9c0 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Fri, 8 Aug 2014 14:25:25 -0700 Subject: mm: allow drivers to prevent new writable mappings This patch (of 6): The i_mmap_writable field counts existing writable mappings of an address_space.
To allow drivers to prevent new writable mappings, make this counter signed and prevent new writable mappings if it is negative. This is modelled after i_writecount and DENYWRITE. This will be required by the shmem-sealing infrastructure to prevent any new writable mappings after the WRITE seal has been set. In case there exists a writable mapping, this operation will fail with EBUSY. Note that we rely on the fact that iff you already own a writable mapping, you can increase the counter without using the helpers. This is the same that we do for i_writecount. Signed-off-by: David Herrmann Acked-by: Hugh Dickins Cc: Michael Kerrisk Cc: Ryan Lortie Cc: Lennart Poettering Cc: Daniel Mack Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index fa91243..1380d8a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -429,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_dec(&inode->i_writecount); mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ if (unlikely(tmp->vm_flags & VM_NONLINEAR)) -- cgit v1.1 From 9183df25fe7b194563db3fec6dc3202a5855839c Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Fri, 8 Aug 2014 14:25:29 -0700 Subject: shm: add memfd_create() syscall memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor that you can pass to mmap(). It can support sealing and avoids any connection to user-visible mount-points. Thus, it's not subject to quotas on mounted file-systems, but can be used like malloc()'ed memory, but with a file-descriptor to it. memfd_create() returns the raw shmem file, so calls like ftruncate() can be used to modify the underlying inode. 
Also calls like fstat() will return proper information and mark the file as regular file. If you want sealing, you can specify MFD_ALLOW_SEALING. Otherwise, sealing is not supported (like on all other regular files). Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not subject to a filesystem size limit. It is still properly accounted to memcg limits, though, and to the same overcommit or no-overcommit accounting as all user memory. Signed-off-by: David Herrmann Acked-by: Hugh Dickins Cc: Michael Kerrisk Cc: Ryan Lortie Cc: Lennart Poettering Cc: Daniel Mack Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2904a21..1f79e37 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -197,6 +197,7 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); +cond_syscall(sys_memfd_create); /* performance counters: */ cond_syscall(sys_perf_event_open); -- cgit v1.1 From 8370edea81e321b8a976969753d6b2811e6d5ed6 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:38 -0700 Subject: bin2c: move bin2c in scripts/basic This patch series does not do kernel signature verification yet. I plan to post another patch series for that. Now distributions are already signing PE/COFF bzImage with PKCS7 signature I plan to parse and verify those signatures. Primary goal of this patchset is to prepare groundwork so that kernel image can be signed and signatures be verified during kexec load. This should help with two things. - It should allow kexec/kdump on secureboot enabled machines. - In general it can help even without secureboot. By being able to verify kernel image signature in kexec, it should help with avoiding module signing restrictions. 
Matthew Garrett showed how to boot into a custom kernel, modify first kernel's memory and then jump back to old kernel and bypass any policy one wants to. This patch (of 15): Kexec wants to use bin2c and it wants to use it really early in the build process. See arch/x86/purgatory/ code in later patches. So move bin2c in scripts/basic so that it can be built very early and be usable by arch/x86/purgatory/. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0026cf5..dc5c775 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -105,7 +105,7 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) -- cgit v1.1 From 7d3e2bca22feb1f4a624009ff6c15e6f724cb4e7 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:43 -0700 Subject: kexec: rename unusebale_pages to unusable_pages Let's use the more common "unusable". This patch was originally written and posted by Boris. I am including it in this patch series. Signed-off-by: Borislav Petkov Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H.
Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 4b8f0c9..c7cc2a0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -154,7 +154,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, INIT_LIST_HEAD(&image->dest_pages); /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unuseable_pages); + INIT_LIST_HEAD(&image->unusable_pages); /* Read in the segments */ image->nr_segments = nr_segments; @@ -609,7 +609,7 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unuseable_pages); + kimage_free_page_list(&image->unusable_pages); } static void kimage_terminate(struct kimage *image) @@ -732,7 +732,7 @@ static struct page *kimage_alloc_page(struct kimage *image, /* If the page cannot be used file it away */ if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unuseable_pages); + list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_pfn(page) << PAGE_SHIFT; -- cgit v1.1 From dabe78628dd886c4b71971d1d78f1cecc674b760 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:45 -0700 Subject: kexec: move segment verification code in a separate function Previously do_kimage_alloc() will allocate a kimage structure, copy segment list from user space and then do the segment list sanity verification. Break down this function in 3 parts. do_kimage_alloc_init() to do actual allocation and basic initialization of kimage structure. 
copy_user_segment_list() to copy segment list from user space and sanity_check_segment_list() to verify the sanity of segment list as passed by user space. In later patches, I need to only allocate kimage and not copy segment list from user space. So breaking down in smaller functions enables re-use of code at other places. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 182 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 100 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index c7cc2a0..062e556 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -125,45 +125,27 @@ static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); -static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int copy_user_segment_list(struct kimage *image, + unsigned long nr_segments, + struct kexec_segment __user *segments) { + int ret; size_t segment_bytes; - struct kimage *image; - unsigned long i; - int result; - - /* Allocate a controlling structure */ - result = -ENOMEM; - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - goto out; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->start = entry; - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unusable_pages); /* Read in the 
segments */ image->nr_segments = nr_segments; segment_bytes = nr_segments * sizeof(*segments); - result = copy_from_user(image->segment, segments, segment_bytes); - if (result) { - result = -EFAULT; - goto out; - } + ret = copy_from_user(image->segment, segments, segment_bytes); + if (ret) + ret = -EFAULT; + + return ret; +} + +static int sanity_check_segment_list(struct kimage *image) +{ + int result, i; + unsigned long nr_segments = image->nr_segments; /* * Verify we have good destination addresses. The caller is @@ -185,9 +167,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - goto out; + return result; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - goto out; + return result; } /* Verify our destination addresses do not overlap. @@ -208,7 +190,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) - goto out; + return result; } } @@ -220,18 +202,61 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, result = -EINVAL; for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) - goto out; + return result; } - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. 
+ */ - return result; + if (image->type == KEXEC_TYPE_CRASH) { + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || + (mend > crashk_res.end)) + return result; + } + } + + return 0; +} + +static struct kimage *do_kimage_alloc_init(void) +{ + struct kimage *image; + + /* Allocate a controlling structure */ + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) + return NULL; + + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unusable pages */ + INIT_LIST_HEAD(&image->unusable_pages); + + return image; } static void kimage_free_page_list(struct list_head *list); @@ -244,10 +269,19 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, struct kimage *image; /* Allocate and initialize a controlling structure */ - image = NULL; - result = do_kimage_alloc(&image, entry, nr_segments, segments); + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->start = entry; + + result = copy_user_segment_list(image, nr_segments, segments); if (result) - goto out; + goto out_free_image; + + result = sanity_check_segment_list(image); + if (result) + goto out_free_image; /* * Find a location for the control code buffer, and add it @@ -259,22 +293,21 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_image; } 
image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { pr_err("Could not allocate swap buffer\n"); - goto out_free; + goto out_free_control_pages; } *rimage = image; return 0; - -out_free: +out_free_control_pages: kimage_free_page_list(&image->control_pages); +out_free_image: kfree(image); -out: return result; } @@ -284,19 +317,17 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, { int result; struct kimage *image; - unsigned long i; - image = NULL; /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) { - result = -EADDRNOTAVAIL; - goto out; - } + if ((entry < crashk_res.start) || (entry > crashk_res.end)) + return -EADDRNOTAVAIL; /* Allocate and initialize a controlling structure */ - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) - goto out; + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->start = entry; /* Enable the special crash kernel control page * allocation policy. @@ -304,25 +335,13 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. 
- */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; + result = copy_user_segment_list(image, nr_segments, segments); + if (result) + goto out_free_image; - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out_free; - } + result = sanity_check_segment_list(image); + if (result) + goto out_free_image; /* * Find a location for the control code buffer, and add @@ -334,15 +353,14 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_image; } *rimage = image; return 0; -out_free: +out_free_image: kfree(image); -out: return result; } -- cgit v1.1 From 255aedd90e3e804fb52e1a71636a3b22cf12f81b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:48 -0700 Subject: kexec: use common function for kimage_normal_alloc() and kimage_crash_alloc() kimage_normal_alloc() and kimage_crash_alloc() are doing lot of similar things and differ only little. So instead of having two separate functions create a common function kimage_alloc_init() and pass it the "flags" argument which tells whether it is normal kexec or kexec_on_panic. And this function should be able to deal with both the cases. This consolidation also helps later where we can use a common function kimage_file_alloc_init() to handle normal and crash cases for new file based kexec syscall. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. 
Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 105 +++++++++++++++++++-------------------------------------- 1 file changed, 34 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 062e556..bfdda31 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -261,12 +261,20 @@ static struct kimage *do_kimage_alloc_init(void) static void kimage_free_page_list(struct list_head *list); -static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments, + unsigned long flags) { - int result; + int ret; struct kimage *image; + bool kexec_on_panic = flags & KEXEC_ON_CRASH; + + if (kexec_on_panic) { + /* Verify we have a valid entry point */ + if ((entry < crashk_res.start) || (entry > crashk_res.end)) + return -EADDRNOTAVAIL; + } /* Allocate and initialize a controlling structure */ image = do_kimage_alloc_init(); @@ -275,20 +283,26 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, image->start = entry; - result = copy_user_segment_list(image, nr_segments, segments); - if (result) + ret = copy_user_segment_list(image, nr_segments, segments); + if (ret) goto out_free_image; - result = sanity_check_segment_list(image); - if (result) + ret = sanity_check_segment_list(image); + if (ret) goto out_free_image; + /* Enable the special crash kernel control page allocation policy. */ + if (kexec_on_panic) { + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be * counted as destination pages. 
*/ - result = -ENOMEM; + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { @@ -296,10 +310,12 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, goto out_free_image; } - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err("Could not allocate swap buffer\n"); - goto out_free_control_pages; + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err("Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; @@ -308,60 +324,7 @@ out_free_control_pages: kimage_free_page_list(&image->control_pages); out_free_image: kfree(image); - return result; -} - -static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) -{ - int result; - struct kimage *image; - - /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) - return -EADDRNOTAVAIL; - - /* Allocate and initialize a controlling structure */ - image = do_kimage_alloc_init(); - if (!image) - return -ENOMEM; - - image->start = entry; - - /* Enable the special crash kernel control page - * allocation policy. - */ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; - - result = copy_user_segment_list(image, nr_segments, segments); - if (result) - goto out_free_image; - - result = sanity_check_segment_list(image); - if (result) - goto out_free_image; - - /* - * Find a location for the control code buffer, and add - * the vector of segments so that it's pages will also be - * counted as destination pages. 
- */ - result = -ENOMEM; - image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_PAGE_SIZE)); - if (!image->control_code_page) { - pr_err("Could not allocate control_code_buffer\n"); - goto out_free_image; - } - - *rimage = image; - return 0; - -out_free_image: - kfree(image); - return result; + return ret; } static int kimage_is_destination_range(struct kimage *image, @@ -1004,16 +967,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, /* Loading another kernel to reboot into */ if ((flags & KEXEC_ON_CRASH) == 0) - result = kimage_normal_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); /* Loading another kernel to switch to if this one crashes */ else if (flags & KEXEC_ON_CRASH) { /* Free any current crash dump kernel before * we corrupt it. */ kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_crash_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); crash_map_reserved_pages(); } if (result) -- cgit v1.1 From 8c86e70acead629aacb4afcd818add66bf6844d9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:50 -0700 Subject: resource: provide new functions to walk through resources I have added two more functions to walk through resources. Currently walk_system_ram_range() deals with pfn and /proc/iomem can contain partial pages. By dealing in pfn, callback function loses the info that last page of a memory range is a partial page and not the full page. So I implemented walk_system_ram_res() which returns u64 values to callback functions and now it properly returns start and end address. walk_system_ram_range() uses find_next_system_ram() to find the next ram resource. This in turn only travels through siblings of top level child and does not traverse through all the nodes of the resource tree.
I also need another function where I can walk through all the resources, for example figure out where "GART" aperture is. Figure out where ACPI memory is. So I wrote another function walk_iomem_res() which walks through all /proc/iomem resources and returns matches as asked by caller. Caller can specify "name" of resource, start and end and flags. Got rid of find_next_system_ram_res() and instead implemented more generic find_next_iomem_res() which can be used to traverse top level children only based on an argument. Signed-off-by: Vivek Goyal Cc: Yinghai Lu Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 92 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 3c2237a..da14b8d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock); static struct resource *bootmem_resource_free; static DEFINE_SPINLOCK(bootmem_resource_lock); -static void *r_next(struct seq_file *m, void *v, loff_t *pos) +static struct resource *next_resource(struct resource *p, bool sibling_only) { - struct resource *p = v; - (*pos)++; + /* Caller wants to traverse through siblings only */ + if (sibling_only) + return p->sibling; + if (p->child) return p->child; while (!p->sibling && p->parent) @@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + (*pos)++; + return (void *)next_resource(p, false); +} + #ifdef CONFIG_PROC_FS enum { MAX_IORES_LEVEL = 5 }; @@ -322,16 +331,19 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if 
!defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * Finds the lowest memory reosurce exists within [res->start.res->end) + * Finds the lowest iomem reosurce exists with-in [res->start.res->end) * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. + * This walks through whole tree and not just first level children + * until and unless first_level_children_only is true. */ -static int find_next_system_ram(struct resource *res, char *name) +static int find_next_iomem_res(struct resource *res, char *name, + bool first_level_children_only) { resource_size_t start, end; struct resource *p; + bool sibling_only = false; BUG_ON(!res); @@ -340,8 +352,14 @@ static int find_next_system_ram(struct resource *res, char *name) BUG_ON(start >= end); read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - /* system ram is just marked as IORESOURCE_MEM */ + + if (first_level_children_only) { + p = iomem_resource.child; + sibling_only = true; + } else + p = &iomem_resource; + + while ((p = next_resource(p, sibling_only))) { if (p->flags != res->flags) continue; if (name && strcmp(p->name, name)) @@ -353,6 +371,7 @@ static int find_next_system_ram(struct resource *res, char *name) if ((p->end >= start) && (p->start < end)) break; } + read_unlock(&resource_lock); if (!p) return -1; @@ -365,6 +384,70 @@ static int find_next_system_ram(struct resource *res, char *name) } /* + * Walks through iomem resources and calls func() with matching resource + * ranges. This walks through whole tree and not just first level children. + * All the memory ranges which overlap start,end and also match flags and + * name are valid candidates. 
+ * + * @name: name of resource + * @flags: resource flags + * @start: start addr + * @end: end addr + */ +int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, + void *arg, int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = flags; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, name, false))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". This function deals with + * full ranges and not pfn. If resources are not pfn aligned, dealing + * with pfn can truncate ranges. + */ +int walk_system_ram_res(u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, "System RAM", true))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) + +/* * This function calls callback against all memory range of "System RAM" * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. * Now, this function is only for "System RAM". 
@@ -382,7 +465,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_system_ram(&res, "System RAM") >= 0)) { + (find_next_iomem_res(&res, "System RAM", true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) -- cgit v1.1 From f0895685c7fd8c938c91a9d8a6f7c11f22df58d2 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:55 -0700 Subject: kexec: new syscall kexec_file_load() declaration This is the new syscall kexec_file_load() declaration/interface. I have reserved the syscall number only for x86_64 so far. Other architectures (including i386) can reserve syscall number when they enable the support for this new syscall. Signed-off-by: Vivek Goyal Cc: Michael Kerrisk Cc: Borislav Petkov Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 7 +++++++ kernel/sys_ni.c | 1 + 2 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index bfdda31..ec4386c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1058,6 +1058,13 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, } #endif +SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, + unsigned long, cmdline_len, const char __user *, cmdline_ptr, + unsigned long, flags) +{ + return -ENOSYS; +} + void crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1f79e37..391d4dd 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapon); cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); 
+cond_syscall(sys_kexec_file_load); cond_syscall(sys_init_module); cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); -- cgit v1.1 From cb1052581e2bddd6096544f3f944f4e7fdad4c7f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:57 -0700 Subject: kexec: implementation of new syscall kexec_file_load Previous patch provided the interface definition and this patch provides implementation of new syscall. Previously segment list was prepared in user space. Now user space just passes kernel fd, initrd fd and command line and kernel will create a segment list internally. This patch contains generic part of the code. Actual segment preparation and loading is done by arch and image specific loader. Which comes in next patch. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 483 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 478 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ec4386c..9b46219 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include #include #include @@ -327,6 +329,221 @@ out_free_image: return ret; } +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) +{ + struct fd f = fdget(fd); + int ret; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; + + if (!f.file) + return -EBADF; + + ret = vfs_getattr(&f.file->f_path, &stat); + if (ret) + goto out; + + if (stat.size > INT_MAX) { + ret = -EFBIG; + goto out; + } + + /* Don't hand 0 to vmalloc, it whines.
*/ + if (stat.size == 0) { + ret = -EINVAL; + goto out; + } + + *buf = vmalloc(stat.size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } + + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(*buf); + ret = bytes; + goto out; + } + + if (bytes == 0) + break; + pos += bytes; + } + + if (pos != stat.size) { + ret = -EBADF; + vfree(*buf); + goto out; + } + + *buf_len = pos; +out: + fdput(f); + return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ + return ERR_PTR(-ENOEXEC); +} + +void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ +} + +/* + * Free up memory used by kernel, initrd, and comand line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. + */ +static void kimage_file_post_load_cleanup(struct kimage *image) +{ + vfree(image->kernel_buf); + image->kernel_buf = NULL; + + vfree(image->initrd_buf); + image->initrd_buf = NULL; + + kfree(image->cmdline_buf); + image->cmdline_buf = NULL; + + /* See if architecture has anything to cleanup post load */ + arch_kimage_file_post_load_cleanup(image); +} + +/* + * In file mode list of segments is prepared by kernel. 
Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, + const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned flags) +{ + int ret = 0; + void *ldata; + + ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, + &image->kernel_buf_len); + if (ret) + return ret; + + /* Call arch image probe handlers */ + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + + if (ret) + goto out; + + /* It is possible that there no initramfs is being loaded */ + if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { + ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, + &image->initrd_buf_len); + if (ret) + goto out; + } + + if (cmdline_len) { + image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); + if (!image->cmdline_buf) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(image->cmdline_buf, cmdline_ptr, + cmdline_len); + if (ret) { + ret = -EFAULT; + goto out; + } + + image->cmdline_buf_len = cmdline_len; + + /* command line should be a string with last byte null */ + if (image->cmdline_buf[cmdline_len - 1] != '\0') { + ret = -EINVAL; + goto out; + } + } + + /* Call arch image load handlers */ + ldata = arch_kexec_kernel_image_load(image); + + if (IS_ERR(ldata)) { + ret = PTR_ERR(ldata); + goto out; + } + + image->image_loader_data = ldata; +out: + /* In case of error, free up all allocated memory in this function */ + if (ret) + kimage_file_post_load_cleanup(image); + return ret; +} + +static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, + int initrd_fd, const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned long flags) +{ + int ret; + struct kimage *image; + + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->file_mode = 1; + + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, + cmdline_ptr, cmdline_len, flags); + if (ret) + goto 
out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_post_load_bufs; + + ret = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_PAGE_SIZE)); + if (!image->control_code_page) { + pr_err("Could not allocate control_code_buffer\n"); + goto out_free_post_load_bufs; + } + + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err(KERN_ERR "Could not allocate swap buffer\n"); + goto out_free_control_pages; + } + + *rimage = image; + return 0; +out_free_control_pages: + kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: + kimage_file_post_load_cleanup(image); + kfree(image->image_loader_data); +out_free_image: + kfree(image); + return ret; +} + static int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end) @@ -644,6 +861,16 @@ static void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + + kfree(image->image_loader_data); + + /* + * Free up any temporary buffers allocated. This might hit if + * error occurred much later after buffer allocation. 
+ */ + if (image->file_mode) + kimage_file_post_load_cleanup(image); + kfree(image); } @@ -772,10 +999,14 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -807,7 +1038,11 @@ static int kimage_load_normal_segment(struct kimage *image, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); - result = copy_from_user(ptr, buf, uchunk); + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { result = -EFAULT; @@ -815,7 +1050,10 @@ static int kimage_load_normal_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -1062,7 +1300,72 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, unsigned long, cmdline_len, const char __user *, cmdline_ptr, unsigned long, flags) { - return -ENOSYS; + int ret = 0, i; + struct kimage **dest_image, *image; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return -EPERM; + + /* Make sure we have a legal set of flags */ + if (flags != (flags & KEXEC_FILE_FLAGS)) + return -EINVAL; + + image = NULL; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_FILE_ON_CRASH) + dest_image = &kexec_crash_image; + + if (flags & KEXEC_FILE_UNLOAD) + goto exchange; + + /* + * In case of crash, new kernel gets loaded in reserved region. It is + * same memory where old crash kernel might be loaded. 
Free any + * current crash dump kernel before we corrupt it. + */ + if (flags & KEXEC_FILE_ON_CRASH) + kimage_free(xchg(&kexec_crash_image, NULL)); + + ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, + cmdline_len, flags); + if (ret) + goto out; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + for (i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); + + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* + * Free up any temporary buffers allocated which are not needed + * after image has been loaded + */ + kimage_file_post_load_cleanup(image); +exchange: + image = xchg(dest_image, image); +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + return ret; } void crash_kexec(struct pt_regs *regs) @@ -1620,6 +1923,176 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +static int __kexec_add_segment(struct kimage *image, char *buf, + unsigned long bufsz, unsigned long mem, + unsigned long memsz) +{ + struct kexec_segment *ksegment; + + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = buf; + ksegment->bufsz = bufsz; + ksegment->mem = mem; + ksegment->memsz = memsz; + image->nr_segments++; + + return 0; +} + +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_end = min(end, kbuf->buf_max); + temp_start = temp_end - kbuf->memsz; + + do { + /* align down start */ + temp_start = temp_start & (~(kbuf->buf_align - 1)); + + if (temp_start < start || temp_start < kbuf->buf_min) + return 0; + + temp_end = temp_start + kbuf->memsz - 1; + + /* + * Make sure this does 
not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_start = max(start, kbuf->buf_min); + + do { + temp_start = ALIGN(temp_start, kbuf->buf_align); + temp_end = temp_start + kbuf->memsz - 1; + + if (temp_end > end || temp_end > kbuf->buf_max) + return 0; + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ + struct kexec_buf *kbuf = (struct kexec_buf *)arg; + unsigned long sz = end - start + 1; + + /* Returning 0 will take to next memory range */ + if (sz < kbuf->memsz) + return 0; + + if (end < kbuf->buf_min || start > kbuf->buf_max) + return 0; + + /* + * Allocate memory top down with-in ram range. Otherwise bottom up + * allocation. + */ + if (kbuf->top_down) + return locate_mem_hole_top_down(start, end, kbuf); + return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. 
This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, + unsigned long memsz, unsigned long buf_align, + unsigned long buf_min, unsigned long buf_max, + bool top_down, unsigned long *load_addr) +{ + + struct kexec_segment *ksegment; + struct kexec_buf buf, *kbuf; + int ret; + + /* Currently adding segment this way is allowed only in file mode */ + if (!image->file_mode) + return -EINVAL; + + if (image->nr_segments >= KEXEC_SEGMENT_MAX) + return -EINVAL; + + /* + * Make sure we are not trying to add buffer after allocating + * control pages. All segments need to be placed first before + * any control pages are allocated. As control page allocation + * logic goes through list of segments to make sure there are + * no destination overlaps. + */ + if (!list_empty(&image->control_pages)) { + WARN_ON(1); + return -EINVAL; + } + + memset(&buf, 0, sizeof(struct kexec_buf)); + kbuf = &buf; + kbuf->image = image; + kbuf->buffer = buffer; + kbuf->bufsz = bufsz; + + kbuf->memsz = ALIGN(memsz, PAGE_SIZE); + kbuf->buf_align = max(buf_align, PAGE_SIZE); + kbuf->buf_min = buf_min; + kbuf->buf_max = buf_max; + kbuf->top_down = top_down; + + /* Walk the RAM ranges and allocate a suitable range for the buffer */ + ret = walk_system_ram_res(0, -1, kbuf, locate_mem_hole_callback); + if (ret != 1) { + /* A suitable memory range could not be found for buffer */ + return -EADDRNOTAVAIL; + } + + /* Found a suitable memory range */ + ksegment = &image->segment[image->nr_segments - 1]; + *load_addr = ksegment->mem; + return 0; +} + + /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. -- cgit v1.1 From 12db5562e0352986a265841638482b84f3a6899b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:26:04 -0700 Subject: kexec: load and relocate purgatory at kernel load time Load purgatory code in RAM and relocate it based on the location. 
Relocation code has been inspired by module relocation code and purgatory relocation code in kexec-tools. Also compute the checksums of loaded kexec segments and store them in purgatory. Arch independent code provides this functionality so that arch dependent bootloaders can make use of it. Helper functions are provided to get/set symbol values in purgatory which are used by bootloaders later to set things like stack and entry point of second kernel etc. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 544 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 543 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 9b46219..669e331 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -42,6 +42,9 @@ #include #include +#include +#include + /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -54,6 +57,15 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; +/* + * Declare these symbols weak so that if architecture provides a purgatory, + * these will be overridden. 
+ */ +char __weak kexec_purgatory[0]; +size_t __weak kexec_purgatory_size = 0; + +static int kexec_calculate_store_digests(struct kimage *image); + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -404,6 +416,24 @@ void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) { } +/* Apply relocations of type RELA */ +int __weak +arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} + +/* Apply relocations of type REL */ +int __weak +arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} + /* * Free up memory used by kernel, initrd, and comand line. This is temporary * memory allocation which is not needed any more after these buffers have @@ -411,6 +441,8 @@ void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) */ static void kimage_file_post_load_cleanup(struct kimage *image) { + struct purgatory_info *pi = &image->purgatory_info; + vfree(image->kernel_buf); image->kernel_buf = NULL; @@ -420,6 +452,12 @@ static void kimage_file_post_load_cleanup(struct kimage *image) kfree(image->cmdline_buf); image->cmdline_buf = NULL; + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + + vfree(pi->sechdrs); + pi->sechdrs = NULL; + /* See if architecture has anything to cleanup post load */ arch_kimage_file_post_load_cleanup(image); } @@ -1105,7 +1143,7 @@ static int kimage_load_crash_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + buf += mchunk; mbytes -= mchunk; } out: @@ -1340,6 +1378,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + ret = kexec_calculate_store_digests(image); + if (ret) + goto out; + for (i = 0; i < image->nr_segments; i++) { struct kexec_segment *ksegment; @@ -2092,6 
+2134,506 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, return 0; } +/* Calculate and store the digest of segments */ +static int kexec_calculate_store_digests(struct kimage *image) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int ret = 0, i, j, zero_buf_sz, sha_region_sz; + size_t desc_size, nullsz; + char *digest; + void *zero_buf; + struct kexec_sha_region *sha_regions; + struct purgatory_info *pi = &image->purgatory_info; + + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); + zero_buf_sz = PAGE_SIZE; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; + } + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out_free_tfm; + } + + sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); + sha_regions = vzalloc(sha_region_sz); + if (!sha_regions) + goto out_free_desc; + + desc->tfm = tfm; + desc->flags = 0; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto out_free_sha_regions; + + digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_sha_regions; + } + + for (j = i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + /* + * Skip purgatory as it will be modified once we put digest + * info in purgatory. + */ + if (ksegment->kbuf == pi->purgatory_buf) + continue; + + ret = crypto_shash_update(desc, ksegment->kbuf, + ksegment->bufsz); + if (ret) + break; + + /* + * Assume rest of the buffer is filled with zero and + * update digest accordingly. 
+ */ + nullsz = ksegment->memsz - ksegment->bufsz; + while (nullsz) { + unsigned long bytes = nullsz; + + if (bytes > zero_buf_sz) + bytes = zero_buf_sz; + ret = crypto_shash_update(desc, zero_buf, bytes); + if (ret) + break; + nullsz -= bytes; + } + + if (ret) + break; + + sha_regions[j].start = ksegment->mem; + sha_regions[j].len = ksegment->memsz; + j++; + } + + if (!ret) { + ret = crypto_shash_final(desc, digest); + if (ret) + goto out_free_digest; + ret = kexec_purgatory_get_set_symbol(image, "sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_digest; + + ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); + if (ret) + goto out_free_digest; + } + +out_free_digest: + kfree(digest); +out_free_sha_regions: + vfree(sha_regions); +out_free_desc: + kfree(desc); +out_free_tfm: + kfree(tfm); +out: + return ret; +} + +/* Actually load purgatory. Lot of code taken from kexec-tools */ +static int __kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down) +{ + struct purgatory_info *pi = &image->purgatory_info; + unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; + unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned char *buf_addr, *src; + int i, ret = 0, entry_sidx = -1; + const Elf_Shdr *sechdrs_c; + Elf_Shdr *sechdrs = NULL; + void *purgatory_buf = NULL; + + /* + * sechdrs_c points to section headers in purgatory and are read + * only. No modifications allowed. + */ + sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + + /* + * We can not modify sechdrs_c[] and its fields. It is read only. + * Copy it over to a local copy where one can store some temporary + * data and free it at the end. We need to modify ->sh_addr and + * ->sh_offset fields to keep track of permanent and temporary + * locations of sections. 
+ */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + + memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + + /* + * We seem to have multiple copies of sections. First copy is which + * is embedded in kernel in read only section. Some of these sections + * will be copied to a temporary buffer and relocated. And these + * sections will finally be copied to their final destination at + * segment load time. + * + * Use ->sh_offset to reflect section address in memory. It will + * point to original read only copy if section is not allocatable. + * Otherwise it will point to temporary copy which will be relocated. + * + * Use ->sh_addr to contain final address of the section where it + * will go during execution time. + */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_NOBITS) + continue; + + sechdrs[i].sh_offset = (unsigned long)pi->ehdr + + sechdrs[i].sh_offset; + } + + /* + * Identify entry point section and make entry relative to section + * start. + */ + entry = pi->ehdr->e_entry; + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + /* Make entry section relative */ + if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && + ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > + pi->ehdr->e_entry)) { + entry_sidx = i; + entry -= sechdrs[i].sh_addr; + break; + } + } + + /* Determine how much memory is needed to load relocatable object. 
*/ + buf_align = 1; + bss_align = 1; + buf_sz = 0; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (buf_align < align) + buf_align = align; + buf_sz = ALIGN(buf_sz, align); + buf_sz += sechdrs[i].sh_size; + } else { + /* bss section */ + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + + /* Determine the bss padding required to align bss properly */ + bss_pad = 0; + if (buf_sz & (bss_align - 1)) + bss_pad = bss_align - (buf_sz & (bss_align - 1)); + + memsz = buf_sz + bss_pad + bss_sz; + + /* Allocate buffer for purgatory */ + purgatory_buf = vzalloc(buf_sz); + if (!purgatory_buf) { + ret = -ENOMEM; + goto out; + } + + if (buf_align < bss_align) + buf_align = bss_align; + + /* Add buffer to segment list */ + ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, + buf_align, min, max, top_down, + &pi->purgatory_load_addr); + if (ret) + goto out; + + /* Load SHF_ALLOC sections */ + buf_addr = purgatory_buf; + load_addr = curr_load_addr = pi->purgatory_load_addr; + bss_addr = load_addr + buf_sz + bss_pad; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *) sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. 
+ */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } else { + bss_addr = ALIGN(bss_addr, align); + sechdrs[i].sh_addr = bss_addr; + bss_addr += sechdrs[i].sh_size; + } + } + + /* Update entry point based on load address of text section */ + if (entry_sidx >= 0) + entry += sechdrs[entry_sidx].sh_addr; + + /* Make kernel jump to purgatory after shutdown */ + image->start = entry; + + /* Used later to get/set symbol values */ + pi->sechdrs = sechdrs; + + /* + * Used later to identify which section is purgatory and skip it + * from checksumming. + */ + pi->purgatory_buf = purgatory_buf; + return ret; +out: + vfree(sechdrs); + vfree(purgatory_buf); + return ret; +} + +static int kexec_apply_relocations(struct kimage *image) +{ + int i, ret; + struct purgatory_info *pi = &image->purgatory_info; + Elf_Shdr *sechdrs = pi->sechdrs; + + /* Apply relocations */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + Elf_Shdr *section, *symtab; + + if (sechdrs[i].sh_type != SHT_RELA && + sechdrs[i].sh_type != SHT_REL) + continue; + + /* + * For section of type SHT_RELA/SHT_REL, + * ->sh_link contains section header index of associated + * symbol table. And ->sh_info contains section header + * index of section to which relocations apply. + */ + if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || + sechdrs[i].sh_link >= pi->ehdr->e_shnum) + return -ENOEXEC; + + section = &sechdrs[sechdrs[i].sh_info]; + symtab = &sechdrs[sechdrs[i].sh_link]; + + if (!(section->sh_flags & SHF_ALLOC)) + continue; + + /* + * symtab->sh_link contain section header index of associated + * string table. + */ + if (symtab->sh_link >= pi->ehdr->e_shnum) + /* Invalid section number? */ + continue; + + /* + * Respective archicture needs to provide support for applying + * relocations of type SHT_RELA/SHT_REL. 
+ */ + if (sechdrs[i].sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi->ehdr, + sechdrs, i); + else if (sechdrs[i].sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi->ehdr, + sechdrs, i); + if (ret) + return ret; + } + + return 0; +} + +/* Load relocatable purgatory object and relocate it appropriately */ +int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr) +{ + struct purgatory_info *pi = &image->purgatory_info; + int ret; + + if (kexec_purgatory_size <= 0) + return -EINVAL; + + if (kexec_purgatory_size < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + + if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 + || pi->ehdr->e_type != ET_REL + || !elf_check_arch(pi->ehdr) + || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (pi->ehdr->e_shoff >= kexec_purgatory_size + || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > + kexec_purgatory_size - pi->ehdr->e_shoff)) + return -ENOEXEC; + + ret = __kexec_load_purgatory(image, min, max, top_down); + if (ret) + return ret; + + ret = kexec_apply_relocations(image); + if (ret) + goto out; + + *load_addr = pi->purgatory_load_addr; + return 0; +out: + vfree(pi->sechdrs); + vfree(pi->purgatory_buf); + return ret; +} + +static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + Elf_Sym *syms; + Elf_Shdr *sechdrs; + Elf_Ehdr *ehdr; + int i, k; + const char *strtab; + + if (!pi->sechdrs || !pi->ehdr) + return NULL; + + sechdrs = pi->sechdrs; + ehdr = pi->ehdr; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (Elf_Sym *)sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if 
(ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} + +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) +{ + struct purgatory_info *pi = &image->purgatory_info; + Elf_Sym *sym; + Elf_Shdr *sechdr; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return ERR_PTR(-EINVAL); + + sechdr = &pi->sechdrs[sym->st_shndx]; + + /* + * Returns the address where symbol will finally be loaded after + * kexec_load_segment() + */ + return (void *)(sechdr->sh_addr + sym->st_value); +} + +/* + * Get or set value of a symbol. If "get_value" is true, symbol value is + * returned in buf otherwise symbol value is set based on value in buf. + */ +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, bool get_value) +{ + Elf_Sym *sym; + Elf_Shdr *sechdrs; + struct purgatory_info *pi = &image->purgatory_info; + char *sym_buf; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return -EINVAL; + + if (sym->st_size != size) { + pr_err("symbol %s size mismatch: expected %lu actual %u\n", + name, (unsigned long)sym->st_size, size); + return -EINVAL; + } + + sechdrs = pi->sechdrs; + + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + pr_err("symbol %s is in a bss section. Cannot %s\n", name, + get_value ? 
"get" : "set"); + return -EINVAL; + } + + sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + + sym->st_value; + + if (get_value) + memcpy((void *)buf, sym_buf, size); + else + memcpy((void *)sym_buf, buf, size); + + return 0; +} /* * Move into place and start executing a preloaded standalone -- cgit v1.1 From 27f48d3e633be23656a097baa3be336e04a82d84 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:26:06 -0700 Subject: kexec-bzImage64: support for loading bzImage using 64bit entry This is loader specific code which can load bzImage and set it up for 64bit entry. This does not take care of 32bit entry or real mode entry. 32bit mode entry can be implemented if somebody needs it. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 669e331..0926f2a 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -460,6 +460,14 @@ static void kimage_file_post_load_cleanup(struct kimage *image) /* See if architecture has anything to cleanup post load */ arch_kimage_file_post_load_cleanup(image); + + /* + * Above call should have called into bootloader to free up + * any data stored in kimage->image_loader_data. It should + * be ok now to free it up. + */ + kfree(image->image_loader_data); + image->image_loader_data = NULL; } /* @@ -576,7 +584,6 @@ out_free_control_pages: kimage_free_page_list(&image->control_pages); out_free_post_load_bufs: kimage_file_post_load_cleanup(image); - kfree(image->image_loader_data); out_free_image: kfree(image); return ret; @@ -900,8 +907,6 @@ static void kimage_free(struct kimage *image) /* Free the kexec control pages... 
*/ kimage_free_page_list(&image->control_pages); - kfree(image->image_loader_data); - /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. -- cgit v1.1 From dd5f726076cc7639d9713b334c8c133f77c6757a Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:26:09 -0700 Subject: kexec: support for kexec on panic using new system call This patch adds support for loading a kexec on panic (kdump) kernel using the new system call. It prepares ELF headers for memory areas to be dumped and for saved cpu registers. Also prepares the memory map for second kernel and limits its boot to reserved areas only. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 46 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 0926f2a..f18c780 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -548,6 +548,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, { int ret; struct kimage *image; + bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; image = do_kimage_alloc_init(); if (!image) @@ -555,6 +556,12 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, image->file_mode = 1; + if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy.
*/ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, cmdline_ptr, cmdline_len, flags); if (ret) @@ -572,10 +579,12 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, goto out_free_post_load_bufs; } - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err(KERN_ERR "Could not allocate swap buffer\n"); - goto out_free_control_pages; + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err(KERN_ERR "Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; @@ -1113,10 +1122,14 @@ static int kimage_load_crash_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -1139,7 +1152,12 @@ static int kimage_load_crash_segment(struct kimage *image, /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } - result = copy_from_user(ptr, buf, uchunk); + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); if (result) { @@ -1148,7 +1166,10 @@ static int kimage_load_crash_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -2127,7 +2148,14 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, kbuf->top_down = top_down; /* Walk the RAM ranges and allocate a suitable range for the buffer */ - ret = 
walk_system_ram_res(0, -1, kbuf, locate_mem_hole_callback); + if (image->type == KEXEC_TYPE_CRASH) + ret = walk_iomem_res("Crash kernel", + IORESOURCE_MEM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); + else + ret = walk_system_ram_res(0, -1, kbuf, + locate_mem_hole_callback); if (ret != 1) { /* A suitable memory range could not be found for buffer */ return -EADDRNOTAVAIL; -- cgit v1.1 From 8e7d838103feac320baf9e68d73f954840ac1eea Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:26:13 -0700 Subject: kexec: verify the signature of signed PE bzImage This is the final piece of the puzzle of verifying kernel image signature during kexec_file_load() syscall. This patch calls into PE file routines to verify signature of bzImage. If signature are valid, kexec_file_load() succeeds otherwise it fails. Two new config options have been introduced. First one is CONFIG_KEXEC_VERIFY_SIG. This option enforces that kernel has to be validly signed otherwise kernel load will fail. If this option is not set, no signature verification will be done. Only exception will be when secureboot is enabled. In that case signature verification should be automatically enforced when secureboot is enabled. But that will happen when secureboot patches are merged. Second config option is CONFIG_KEXEC_BZIMAGE_VERIFY_SIG. This option enables signature verification support on bzImage. If this option is not set and previous one is set, kernel image loading will fail because kernel does not have support to verify signature of bzImage. I tested these patches with both "pesign" and "sbsign" signed bzImages. I used signing_key.priv key and signing_key.x509 cert for signing as generated during kernel build process (if module signing is enabled). Used following method to sign bzImage. 
pesign ====== - Convert DER format cert to PEM format cert openssl x509 -in signing_key.x509 -inform DER -out signing_key.x509.PEM -outform PEM - Generate a .p12 file from existing cert and private key file openssl pkcs12 -export -out kernel-key.p12 -inkey signing_key.priv -in signing_key.x509.PEM - Import .p12 file into pesign db pk12util -i /tmp/kernel-key.p12 -d /etc/pki/pesign - Sign bzImage pesign -i /boot/vmlinuz-3.16.0-rc3+ -o /boot/vmlinuz-3.16.0-rc3+.signed.pesign -c "Glacier signing key - Magrathea" -s sbsign ====== sbsign --key signing_key.priv --cert signing_key.x509.PEM --output /boot/vmlinuz-3.16.0-rc3+.signed.sbsign /boot/vmlinuz-3.16.0-rc3+ Patch details: Well all the hard work is done in previous patches. Now bzImage loader has just call into that code and verify whether bzImage signature are valid or not. Also create two config options. First one is CONFIG_KEXEC_VERIFY_SIG. This option enforces that kernel has to be validly signed otherwise kernel load will fail. If this option is not set, no signature verification will be done. Only exception will be when secureboot is enabled. In that case signature verification should be automatically enforced when secureboot is enabled. But that will happen when secureboot patches are merged. Second config option is CONFIG_KEXEC_BZIMAGE_VERIFY_SIG. This option enables signature verification support on bzImage. If this option is not set and previous one is set, kernel image loading will fail because kernel does not have support to verify signature of bzImage. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. 
Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Cc: Matt Fleming Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index f18c780..0b49a0a 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -416,6 +416,12 @@ void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) { } +int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -EKEYREJECTED; +} + /* Apply relocations of type RELA */ int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, @@ -494,6 +500,15 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, if (ret) goto out; +#ifdef CONFIG_KEXEC_VERIFY_SIG + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + if (ret) { + pr_debug("kernel signature verification failed.\n"); + goto out; + } + pr_debug("kernel signature verification successful.\n"); +#endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, -- cgit v1.1 From 69f6a34bdeea4fec50bb90619bc9602973119572 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 10 Aug 2014 20:50:30 -0700 Subject: seccomp: Replace BUG(!spin_is_locked()) with assert_spin_lock Current upstream kernel hangs with mips and powerpc targets in uniprocessor mode if SECCOMP is configured. Bisect points to commit dbd952127d11 ("seccomp: introduce writer locking"). Turns out that code such as BUG_ON(!spin_is_locked(&list_lock)); can not be used in uniprocessor mode because spin_is_locked() always returns false in this configuration, and that assert_spin_locked() exists for that very purpose and must be used instead. 
Fixes: dbd952127d11 ("seccomp: introduce writer locking") Cc: Kees Cook Signed-off-by: Guenter Roeck Signed-off-by: Kees Cook --- kernel/fork.c | 2 +- kernel/seccomp.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1380d8a..0cf9cdb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1105,7 +1105,7 @@ static void copy_seccomp(struct task_struct *p) * needed because this new task is not yet running and cannot * be racing exec. */ - BUG_ON(!spin_is_locked(&current->sighand->siglock)); + assert_spin_locked(&current->sighand->siglock); /* Ref-count the new filter user, and assign it. */ get_seccomp_filter(current); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 25b0043..44eb005 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -203,7 +203,7 @@ static u32 seccomp_run_filters(int syscall) static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) { - BUG_ON(!spin_is_locked(&current->sighand->siglock)); + assert_spin_locked(&current->sighand->siglock); if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) return false; @@ -214,7 +214,7 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) static inline void seccomp_assign_mode(struct task_struct *task, unsigned long seccomp_mode) { - BUG_ON(!spin_is_locked(&task->sighand->siglock)); + assert_spin_locked(&task->sighand->siglock); task->seccomp.mode = seccomp_mode; /* @@ -253,7 +253,7 @@ static inline pid_t seccomp_can_sync_threads(void) struct task_struct *thread, *caller; BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); - BUG_ON(!spin_is_locked(&current->sighand->siglock)); + assert_spin_locked(&current->sighand->siglock); /* Validate all threads being eligible for synchronization.
*/ caller = current; @@ -294,7 +294,7 @@ static inline void seccomp_sync_threads(void) struct task_struct *thread, *caller; BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); - BUG_ON(!spin_is_locked(&current->sighand->siglock)); + assert_spin_locked(&current->sighand->siglock); /* Synchronize all threads. */ caller = current; @@ -464,7 +464,7 @@ static long seccomp_attach_filter(unsigned int flags, unsigned long total_insns; struct seccomp_filter *walker; - BUG_ON(!spin_is_locked(&current->sighand->siglock)); + assert_spin_locked(&current->sighand->siglock); /* Validate resulting filter length. */ total_insns = filter->prog->len; -- cgit v1.1 From 14c4000a88afaaa2d0877cc86d42a74fde0f35e0 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Sat, 9 Aug 2014 11:15:30 +0530 Subject: printk: Add function to return log buffer address and size Platforms like IBM Power Systems supports service processor assisted dump. It provides interface to add memory region to be captured when system is crashed. During initialization/running we can add kernel memory region to be collected. Presently we don't have a way to get the log buffer base address and size. This patch adds support to return log buffer address and size.
Signed-off-by: Vasant Hegde Signed-off-by: Benjamin Herrenschmidt Acked-by: Andrew Morton --- kernel/printk/printk.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index de1a6bb..e04c455 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -272,6 +272,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; +/* Return log buffer address */ +char *log_buf_addr_get(void) +{ + return log_buf; +} + +/* Return log buffer size */ +u32 log_buf_len_get(void) +{ + return log_buf_len; +} + /* human readable text of the record */ static char *log_text(const struct printk_log *msg) { -- cgit v1.1 From 0680eb1f485ba5aac2ee02c9f0622239c9a4b16c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 13 Aug 2014 12:47:14 -0700 Subject: timekeeping: Another fix to the VSYSCALL_OLD update_vsyscall Benjamin Herrenschmidt pointed out that I further missed modifying update_vsyscall after the wall_to_mono value was changed to a timespec64. This causes issues on powerpc32, which expects a 32bit timespec. This patch fixes the problem by properly converting from a timespec64 to a timespec before passing the value on to the arch-specific vsyscall logic. [ Thomas is currently on vacation, but reviewed it and wanted me to send this fix on to you directly. 
] Cc: LKML Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Benjamin Herrenschmidt Reported-by: Benjamin Herrenschmidt Reviewed-by: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Linus Torvalds --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f36b028..fb4a9c2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -338,10 +338,11 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); static inline void update_vsyscall(struct timekeeper *tk) { - struct timespec xt; + struct timespec xt, wm; xt = timespec64_to_timespec(tk_xtime(tk)); - update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult, + wm = timespec64_to_timespec(tk->wall_to_monotonic); + update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, tk->tkr.cycle_last); } -- cgit v1.1 From ff7e0055bb5ddbbb320cdd8dfd3e18672bddd2ad Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 16 Aug 2014 04:13:37 +0930 Subject: module: Clean up ro/nx after early module load failures The commit 4982223e51e8 module: set nx before marking module MODULE_STATE_COMING. introduced a regression: if a module fails to parse its arguments or if mod_sysfs_setup fails, then the module's memory will be freed while still read-only. Anything that reuses that memory will crash as soon as it tries to write to it. 
Cc: stable@vger.kernel.org # v3.16 Cc: Rusty Russell Signed-off-by: Andy Lutomirski Signed-off-by: Rusty Russell --- kernel/module.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6f69463..03214bd2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3304,6 +3304,11 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_lock(&module_mutex); module_bug_cleanup(mod); mutex_unlock(&module_mutex); + + /* we can't deallocate the module until we clear memory protection */ + unset_module_init_ro_nx(mod); + unset_module_core_ro_nx(mod); + ddebug_cleanup: dynamic_debug_remove(info->debug); synchronize_sched(); -- cgit v1.1