summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Ingo Molnar <mingo@kernel.org> 2013-08-14 17:58:56 +0200
committer: Ingo Molnar <mingo@kernel.org> 2013-08-14 17:58:56 +0200
commit: 6f1d657668ac3041b65265d3653d7e9172a0d603 (patch)
tree: 6e837c683783708637cc4caf9de759166c7469b7
parent: d4e4ab86bcba5a72779c43dc1459f71fea3d89c8 (diff)
parent: d13508f9440e46dccac6a2dd48d51a73b2207482 (diff)
download: op-kernel-dev-6f1d657668ac3041b65265d3653d7e9172a0d603.zip
download: op-kernel-dev-6f1d657668ac3041b65265d3653d7e9172a0d603.tar.gz
Merge branch 'timers/nohz-v3' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks into timers/nohz
Pull nohz improvements from Frederic Weisbecker: "It mostly contains fixes and full dynticks off-case optimizations. I believe that distros want to enable this feature, so it seems important to optimize the case where the "nohz_full=" parameter is empty, i.e. I'm trying to remove any performance regression that comes with NO_HZ_FULL=y when the feature is not used. This patchset improves the current situation a lot (the off-case appears to be around 11% faster with hackbench, although I guess it may vary depending on the configuration, but it should be significantly faster in any case). There is still some work to do: I can still observe a remaining loss of 1.6% throughput with hackbench compared to CONFIG_NO_HZ_FULL=n." Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- arch/ia64/include/asm/Kbuild | 1
-rw-r--r-- arch/m68k/include/asm/irqflags.h | 2
-rw-r--r-- arch/powerpc/include/asm/Kbuild | 1
-rw-r--r-- arch/s390/include/asm/cputime.h | 3
-rw-r--r-- arch/s390/include/asm/vtime.h | 7
-rw-r--r-- arch/s390/kernel/vtime.c | 1
-rw-r--r-- include/asm-generic/vtime.h | 0
-rw-r--r-- include/linux/context_tracking.h | 128
-rw-r--r-- include/linux/context_tracking_state.h | 39
-rw-r--r-- include/linux/hardirq.h | 117
-rw-r--r-- include/linux/preempt_mask.h | 122
-rw-r--r-- include/linux/tick.h | 45
-rw-r--r-- include/linux/vtime.h | 74
-rw-r--r-- include/trace/events/context_tracking.h | 58
-rw-r--r-- init/Kconfig | 28
-rw-r--r-- init/main.c | 2
-rw-r--r-- kernel/context_tracking.c | 125
-rw-r--r-- kernel/sched/core.c | 4
-rw-r--r-- kernel/sched/cputime.c | 53
-rw-r--r-- kernel/time/Kconfig | 1
-rw-r--r-- kernel/time/sched_clock.c | 2
-rw-r--r-- kernel/time/tick-sched.c | 59
22 files changed, 544 insertions, 328 deletions
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 05b03ec..a3456f3 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += clkdev.h
generic-y += exec.h
generic-y += kvm_para.h
generic-y += trace_clock.h
+generic-y += vtime.h
\ No newline at end of file
diff --git a/arch/m68k/include/asm/irqflags.h b/arch/m68k/include/asm/irqflags.h
index 7ef4115..4c62755 100644
--- a/arch/m68k/include/asm/irqflags.h
+++ b/arch/m68k/include/asm/irqflags.h
@@ -3,7 +3,7 @@
#include <linux/types.h>
#ifdef CONFIG_MMU
-#include <linux/hardirq.h>
+#include <linux/preempt_mask.h>
#endif
#include <linux/preempt.h>
#include <asm/thread_info.h>
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 650757c..704e6f1 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -2,3 +2,4 @@
generic-y += clkdev.h
generic-y += rwsem.h
generic-y += trace_clock.h
+generic-y += vtime.h
\ No newline at end of file
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index d2ff4137..f65bd36 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -13,9 +13,6 @@
#include <asm/div64.h>
-#define __ARCH_HAS_VTIME_ACCOUNT
-#define __ARCH_HAS_VTIME_TASK_SWITCH
-
/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */
typedef unsigned long long __nocast cputime_t;
diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h
new file mode 100644
index 0000000..af9896c
--- /dev/null
+++ b/arch/s390/include/asm/vtime.h
@@ -0,0 +1,7 @@
+#ifndef _S390_VTIME_H
+#define _S390_VTIME_H
+
+#define __ARCH_HAS_VTIME_ACCOUNT
+#define __ARCH_HAS_VTIME_TASK_SWITCH
+
+#endif /* _S390_VTIME_H */
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 9b9c1b7..abcfab5 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -19,6 +19,7 @@
#include <asm/irq_regs.h>
#include <asm/cputime.h>
#include <asm/vtimer.h>
+#include <asm/vtime.h>
#include <asm/irq.h>
#include "entry.h"
diff --git a/include/asm-generic/vtime.h b/include/asm-generic/vtime.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/include/asm-generic/vtime.h
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index fc09d7b..1581587 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -2,100 +2,110 @@
#define _LINUX_CONTEXT_TRACKING_H
#include <linux/sched.h>
-#include <linux/percpu.h>
#include <linux/vtime.h>
+#include <linux/context_tracking_state.h>
#include <asm/ptrace.h>
-struct context_tracking {
- /*
- * When active is false, probes are unset in order
- * to minimize overhead: TIF flags are cleared
- * and calls to user_enter/exit are ignored. This
- * may be further optimized using static keys.
- */
- bool active;
- enum ctx_state {
- IN_KERNEL = 0,
- IN_USER,
- } state;
-};
-
-static inline void __guest_enter(void)
-{
- /*
- * This is running in ioctl context so we can avoid
- * the call to vtime_account() with its unnecessary idle check.
- */
- vtime_account_system(current);
- current->flags |= PF_VCPU;
-}
-
-static inline void __guest_exit(void)
-{
- /*
- * This is running in ioctl context so we can avoid
- * the call to vtime_account() with its unnecessary idle check.
- */
- vtime_account_system(current);
- current->flags &= ~PF_VCPU;
-}
#ifdef CONFIG_CONTEXT_TRACKING
-DECLARE_PER_CPU(struct context_tracking, context_tracking);
+extern void context_tracking_cpu_set(int cpu);
-static inline bool context_tracking_in_user(void)
+extern void context_tracking_user_enter(void);
+extern void context_tracking_user_exit(void);
+extern void __context_tracking_task_switch(struct task_struct *prev,
+ struct task_struct *next);
+
+static inline void user_enter(void)
{
- return __this_cpu_read(context_tracking.state) == IN_USER;
-}
+ if (static_key_false(&context_tracking_enabled))
+ context_tracking_user_enter();
-static inline bool context_tracking_active(void)
+}
+static inline void user_exit(void)
{
- return __this_cpu_read(context_tracking.active);
+ if (static_key_false(&context_tracking_enabled))
+ context_tracking_user_exit();
}
-extern void user_enter(void);
-extern void user_exit(void);
-
-extern void guest_enter(void);
-extern void guest_exit(void);
-
static inline enum ctx_state exception_enter(void)
{
enum ctx_state prev_ctx;
+ if (!static_key_false(&context_tracking_enabled))
+ return 0;
+
prev_ctx = this_cpu_read(context_tracking.state);
- user_exit();
+ context_tracking_user_exit();
return prev_ctx;
}
static inline void exception_exit(enum ctx_state prev_ctx)
{
- if (prev_ctx == IN_USER)
- user_enter();
+ if (static_key_false(&context_tracking_enabled)) {
+ if (prev_ctx == IN_USER)
+ context_tracking_user_enter();
+ }
}
-extern void context_tracking_task_switch(struct task_struct *prev,
- struct task_struct *next);
+static inline void context_tracking_task_switch(struct task_struct *prev,
+ struct task_struct *next)
+{
+ if (static_key_false(&context_tracking_enabled))
+ __context_tracking_task_switch(prev, next);
+}
#else
-static inline bool context_tracking_in_user(void) { return false; }
static inline void user_enter(void) { }
static inline void user_exit(void) { }
+static inline enum ctx_state exception_enter(void) { return 0; }
+static inline void exception_exit(enum ctx_state prev_ctx) { }
+static inline void context_tracking_task_switch(struct task_struct *prev,
+ struct task_struct *next) { }
+#endif /* !CONFIG_CONTEXT_TRACKING */
+
+
+#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+extern void context_tracking_init(void);
+#else
+static inline void context_tracking_init(void) { }
+#endif /* CONFIG_CONTEXT_TRACKING_FORCE */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static inline void guest_enter(void)
{
- __guest_enter();
+ if (vtime_accounting_enabled())
+ vtime_guest_enter(current);
+ else
+ current->flags |= PF_VCPU;
}
static inline void guest_exit(void)
{
- __guest_exit();
+ if (vtime_accounting_enabled())
+ vtime_guest_exit(current);
+ else
+ current->flags &= ~PF_VCPU;
}
-static inline enum ctx_state exception_enter(void) { return 0; }
-static inline void exception_exit(enum ctx_state prev_ctx) { }
-static inline void context_tracking_task_switch(struct task_struct *prev,
- struct task_struct *next) { }
-#endif /* !CONFIG_CONTEXT_TRACKING */
+#else
+static inline void guest_enter(void)
+{
+ /*
+ * This is running in ioctl context so it's safe
+ * to assume that it's the stime pending cputime
+ * to flush.
+ */
+ vtime_account_system(current);
+ current->flags |= PF_VCPU;
+}
+
+static inline void guest_exit(void)
+{
+ /* Flush the guest cputime we spent on the guest */
+ vtime_account_system(current);
+ current->flags &= ~PF_VCPU;
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
#endif
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
new file mode 100644
index 0000000..0f1979d
--- /dev/null
+++ b/include/linux/context_tracking_state.h
@@ -0,0 +1,39 @@
+#ifndef _LINUX_CONTEXT_TRACKING_STATE_H
+#define _LINUX_CONTEXT_TRACKING_STATE_H
+
+#include <linux/percpu.h>
+#include <linux/static_key.h>
+
+struct context_tracking {
+ /*
+ * When active is false, probes are unset in order
+ * to minimize overhead: TIF flags are cleared
+ * and calls to user_enter/exit are ignored. This
+ * may be further optimized using static keys.
+ */
+ bool active;
+ enum ctx_state {
+ IN_KERNEL = 0,
+ IN_USER,
+ } state;
+};
+
+#ifdef CONFIG_CONTEXT_TRACKING
+extern struct static_key context_tracking_enabled;
+DECLARE_PER_CPU(struct context_tracking, context_tracking);
+
+static inline bool context_tracking_in_user(void)
+{
+ return __this_cpu_read(context_tracking.state) == IN_USER;
+}
+
+static inline bool context_tracking_active(void)
+{
+ return __this_cpu_read(context_tracking.active);
+}
+#else
+static inline bool context_tracking_in_user(void) { return false; }
+static inline bool context_tracking_active(void) { return false; }
+#endif /* CONFIG_CONTEXT_TRACKING */
+
+#endif
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 05bcc09..ccfe17c 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -1,126 +1,11 @@
#ifndef LINUX_HARDIRQ_H
#define LINUX_HARDIRQ_H
-#include <linux/preempt.h>
+#include <linux/preempt_mask.h>
#include <linux/lockdep.h>
#include <linux/ftrace_irq.h>
#include <linux/vtime.h>
-#include <asm/hardirq.h>
-/*
- * We put the hardirq and softirq counter into the preemption
- * counter. The bitmask has the following meaning:
- *
- * - bits 0-7 are the preemption count (max preemption depth: 256)
- * - bits 8-15 are the softirq count (max # of softirqs: 256)
- *
- * The hardirq count can in theory reach the same as NR_IRQS.
- * In reality, the number of nested IRQS is limited to the stack
- * size as well. For archs with over 1000 IRQS it is not practical
- * to expect that they will all nest. We give a max of 10 bits for
- * hardirq nesting. An arch may choose to give less than 10 bits.
- * m68k expects it to be 8.
- *
- * - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024)
- * - bit 26 is the NMI_MASK
- * - bit 27 is the PREEMPT_ACTIVE flag
- *
- * PREEMPT_MASK: 0x000000ff
- * SOFTIRQ_MASK: 0x0000ff00
- * HARDIRQ_MASK: 0x03ff0000
- * NMI_MASK: 0x04000000
- */
-#define PREEMPT_BITS 8
-#define SOFTIRQ_BITS 8
-#define NMI_BITS 1
-
-#define MAX_HARDIRQ_BITS 10
-
-#ifndef HARDIRQ_BITS
-# define HARDIRQ_BITS MAX_HARDIRQ_BITS
-#endif
-
-#if HARDIRQ_BITS > MAX_HARDIRQ_BITS
-#error HARDIRQ_BITS too high!
-#endif
-
-#define PREEMPT_SHIFT 0
-#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
-#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
-#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)
-
-#define __IRQ_MASK(x) ((1UL << (x))-1)
-
-#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
-#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
-#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
-#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
-
-#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
-#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
-#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
-#define NMI_OFFSET (1UL << NMI_SHIFT)
-
-#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
-
-#ifndef PREEMPT_ACTIVE
-#define PREEMPT_ACTIVE_BITS 1
-#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
-#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
-#endif
-
-#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
-#error PREEMPT_ACTIVE is too low!
-#endif
-
-#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
-#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
-#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
- | NMI_MASK))
-
-/*
- * Are we doing bottom half or hardware interrupt processing?
- * Are we in a softirq context? Interrupt context?
- * in_softirq - Are we currently processing softirq or have bh disabled?
- * in_serving_softirq - Are we currently processing softirq?
- */
-#define in_irq() (hardirq_count())
-#define in_softirq() (softirq_count())
-#define in_interrupt() (irq_count())
-#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
-
-/*
- * Are we in NMI context?
- */
-#define in_nmi() (preempt_count() & NMI_MASK)
-
-#if defined(CONFIG_PREEMPT_COUNT)
-# define PREEMPT_CHECK_OFFSET 1
-#else
-# define PREEMPT_CHECK_OFFSET 0
-#endif
-
-/*
- * Are we running in atomic context? WARNING: this macro cannot
- * always detect atomic context; in particular, it cannot know about
- * held spinlocks in non-preemptible kernels. Thus it should not be
- * used in the general case to determine whether sleeping is possible.
- * Do not use in_atomic() in driver code.
- */
-#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
-
-/*
- * Check whether we were atomic before we did preempt_disable():
- * (used by the scheduler, *after* releasing the kernel lock)
- */
-#define in_atomic_preempt_off() \
- ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
-
-#ifdef CONFIG_PREEMPT_COUNT
-# define preemptible() (preempt_count() == 0 && !irqs_disabled())
-#else
-# define preemptible() 0
-#endif
#if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS)
extern void synchronize_irq(unsigned int irq);
diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h
new file mode 100644
index 0000000..931bc61
--- /dev/null
+++ b/include/linux/preempt_mask.h
@@ -0,0 +1,122 @@
+#ifndef LINUX_PREEMPT_MASK_H
+#define LINUX_PREEMPT_MASK_H
+
+#include <linux/preempt.h>
+#include <asm/hardirq.h>
+
+/*
+ * We put the hardirq and softirq counter into the preemption
+ * counter. The bitmask has the following meaning:
+ *
+ * - bits 0-7 are the preemption count (max preemption depth: 256)
+ * - bits 8-15 are the softirq count (max # of softirqs: 256)
+ *
+ * The hardirq count can in theory reach the same as NR_IRQS.
+ * In reality, the number of nested IRQS is limited to the stack
+ * size as well. For archs with over 1000 IRQS it is not practical
+ * to expect that they will all nest. We give a max of 10 bits for
+ * hardirq nesting. An arch may choose to give less than 10 bits.
+ * m68k expects it to be 8.
+ *
+ * - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024)
+ * - bit 26 is the NMI_MASK
+ * - bit 27 is the PREEMPT_ACTIVE flag
+ *
+ * PREEMPT_MASK: 0x000000ff
+ * SOFTIRQ_MASK: 0x0000ff00
+ * HARDIRQ_MASK: 0x03ff0000
+ * NMI_MASK: 0x04000000
+ */
+#define PREEMPT_BITS 8
+#define SOFTIRQ_BITS 8
+#define NMI_BITS 1
+
+#define MAX_HARDIRQ_BITS 10
+
+#ifndef HARDIRQ_BITS
+# define HARDIRQ_BITS MAX_HARDIRQ_BITS
+#endif
+
+#if HARDIRQ_BITS > MAX_HARDIRQ_BITS
+#error HARDIRQ_BITS too high!
+#endif
+
+#define PREEMPT_SHIFT 0
+#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
+#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
+#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)
+
+#define __IRQ_MASK(x) ((1UL << (x))-1)
+
+#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
+#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
+#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
+#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
+
+#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
+#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
+#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
+#define NMI_OFFSET (1UL << NMI_SHIFT)
+
+#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
+
+#ifndef PREEMPT_ACTIVE
+#define PREEMPT_ACTIVE_BITS 1
+#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
+#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
+#endif
+
+#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
+#error PREEMPT_ACTIVE is too low!
+#endif
+
+#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
+#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
+#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
+ | NMI_MASK))
+
+/*
+ * Are we doing bottom half or hardware interrupt processing?
+ * Are we in a softirq context? Interrupt context?
+ * in_softirq - Are we currently processing softirq or have bh disabled?
+ * in_serving_softirq - Are we currently processing softirq?
+ */
+#define in_irq() (hardirq_count())
+#define in_softirq() (softirq_count())
+#define in_interrupt() (irq_count())
+#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
+
+/*
+ * Are we in NMI context?
+ */
+#define in_nmi() (preempt_count() & NMI_MASK)
+
+#if defined(CONFIG_PREEMPT_COUNT)
+# define PREEMPT_CHECK_OFFSET 1
+#else
+# define PREEMPT_CHECK_OFFSET 0
+#endif
+
+/*
+ * Are we running in atomic context? WARNING: this macro cannot
+ * always detect atomic context; in particular, it cannot know about
+ * held spinlocks in non-preemptible kernels. Thus it should not be
+ * used in the general case to determine whether sleeping is possible.
+ * Do not use in_atomic() in driver code.
+ */
+#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
+
+/*
+ * Check whether we were atomic before we did preempt_disable():
+ * (used by the scheduler, *after* releasing the kernel lock)
+ */
+#define in_atomic_preempt_off() \
+ ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
+
+#ifdef CONFIG_PREEMPT_COUNT
+# define preemptible() (preempt_count() == 0 && !irqs_disabled())
+#else
+# define preemptible() 0
+#endif
+
+#endif /* LINUX_PREEMPT_MASK_H */
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 62bd8b7..5128d33 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -10,6 +10,8 @@
#include <linux/irqflags.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
+#include <linux/context_tracking_state.h>
+#include <linux/cpumask.h>
#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -158,20 +160,51 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
# endif /* !CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
+extern bool tick_nohz_full_running;
+extern cpumask_var_t tick_nohz_full_mask;
+
+static inline bool tick_nohz_full_enabled(void)
+{
+ if (!static_key_false(&context_tracking_enabled))
+ return false;
+
+ return tick_nohz_full_running;
+}
+
+static inline bool tick_nohz_full_cpu(int cpu)
+{
+ if (!tick_nohz_full_enabled())
+ return false;
+
+ return cpumask_test_cpu(cpu, tick_nohz_full_mask);
+}
+
extern void tick_nohz_init(void);
-extern int tick_nohz_full_cpu(int cpu);
-extern void tick_nohz_full_check(void);
+extern void __tick_nohz_full_check(void);
extern void tick_nohz_full_kick(void);
extern void tick_nohz_full_kick_all(void);
-extern void tick_nohz_task_switch(struct task_struct *tsk);
+extern void __tick_nohz_task_switch(struct task_struct *tsk);
#else
static inline void tick_nohz_init(void) { }
-static inline int tick_nohz_full_cpu(int cpu) { return 0; }
-static inline void tick_nohz_full_check(void) { }
+static inline bool tick_nohz_full_enabled(void) { return false; }
+static inline bool tick_nohz_full_cpu(int cpu) { return false; }
+static inline void __tick_nohz_full_check(void) { }
static inline void tick_nohz_full_kick(void) { }
static inline void tick_nohz_full_kick_all(void) { }
-static inline void tick_nohz_task_switch(struct task_struct *tsk) { }
+static inline void __tick_nohz_task_switch(struct task_struct *tsk) { }
#endif
+static inline void tick_nohz_full_check(void)
+{
+ if (tick_nohz_full_enabled())
+ __tick_nohz_full_check();
+}
+
+static inline void tick_nohz_task_switch(struct task_struct *tsk)
+{
+ if (tick_nohz_full_enabled())
+ __tick_nohz_task_switch(tsk);
+}
+
#endif
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index b1dd2db..f5b72b3 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -1,18 +1,68 @@
#ifndef _LINUX_KERNEL_VTIME_H
#define _LINUX_KERNEL_VTIME_H
+#include <linux/context_tracking_state.h>
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+#include <asm/vtime.h>
+#endif
+
+
struct task_struct;
+/*
+ * vtime_accounting_enabled() definitions/declarations
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+static inline bool vtime_accounting_enabled(void) { return true; }
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+static inline bool vtime_accounting_enabled(void)
+{
+ if (static_key_false(&context_tracking_enabled)) {
+ if (context_tracking_active())
+ return true;
+ }
+
+ return false;
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+static inline bool vtime_accounting_enabled(void) { return false; }
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+/*
+ * Common vtime APIs
+ */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+
+#ifdef __ARCH_HAS_VTIME_TASK_SWITCH
extern void vtime_task_switch(struct task_struct *prev);
+#else
+extern void vtime_common_task_switch(struct task_struct *prev);
+static inline void vtime_task_switch(struct task_struct *prev)
+{
+ if (vtime_accounting_enabled())
+ vtime_common_task_switch(prev);
+}
+#endif /* __ARCH_HAS_VTIME_TASK_SWITCH */
+
extern void vtime_account_system(struct task_struct *tsk);
extern void vtime_account_idle(struct task_struct *tsk);
extern void vtime_account_user(struct task_struct *tsk);
-extern void vtime_account_irq_enter(struct task_struct *tsk);
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-static inline bool vtime_accounting_enabled(void) { return true; }
-#endif
+#ifdef __ARCH_HAS_VTIME_ACCOUNT
+extern void vtime_account_irq_enter(struct task_struct *tsk);
+#else
+extern void vtime_common_account_irq_enter(struct task_struct *tsk);
+static inline void vtime_account_irq_enter(struct task_struct *tsk)
+{
+ if (vtime_accounting_enabled())
+ vtime_common_account_irq_enter(tsk);
+}
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
@@ -20,14 +70,20 @@ static inline void vtime_task_switch(struct task_struct *prev) { }
static inline void vtime_account_system(struct task_struct *tsk) { }
static inline void vtime_account_user(struct task_struct *tsk) { }
static inline void vtime_account_irq_enter(struct task_struct *tsk) { }
-static inline bool vtime_accounting_enabled(void) { return false; }
-#endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern void arch_vtime_task_switch(struct task_struct *tsk);
-extern void vtime_account_irq_exit(struct task_struct *tsk);
-extern bool vtime_accounting_enabled(void);
+extern void vtime_gen_account_irq_exit(struct task_struct *tsk);
+
+static inline void vtime_account_irq_exit(struct task_struct *tsk)
+{
+ if (vtime_accounting_enabled())
+ vtime_gen_account_irq_exit(tsk);
+}
+
extern void vtime_user_enter(struct task_struct *tsk);
+
static inline void vtime_user_exit(struct task_struct *tsk)
{
vtime_account_user(tsk);
@@ -35,7 +91,7 @@ static inline void vtime_user_exit(struct task_struct *tsk)
extern void vtime_guest_enter(struct task_struct *tsk);
extern void vtime_guest_exit(struct task_struct *tsk);
extern void vtime_init_idle(struct task_struct *tsk, int cpu);
-#else
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */
static inline void vtime_account_irq_exit(struct task_struct *tsk)
{
/* On hard|softirq exit we always account to hard|softirq cputime */
diff --git a/include/trace/events/context_tracking.h b/include/trace/events/context_tracking.h
new file mode 100644
index 0000000..ce8007c
--- /dev/null
+++ b/include/trace/events/context_tracking.h
@@ -0,0 +1,58 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM context_tracking
+
+#if !defined(_TRACE_CONTEXT_TRACKING_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_CONTEXT_TRACKING_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(context_tracking_user,
+
+ TP_PROTO(int dummy),
+
+ TP_ARGS(dummy),
+
+ TP_STRUCT__entry(
+ __field( int, dummy )
+ ),
+
+ TP_fast_assign(
+ __entry->dummy = dummy;
+ ),
+
+ TP_printk("%s", "")
+);
+
+/**
+ * user_enter - called when the kernel resumes to userspace
+ * @dummy: dummy arg to make trace event macro happy
+ *
+ * This event occurs when the kernel resumes to userspace after
+ * an exception or a syscall.
+ */
+DEFINE_EVENT(context_tracking_user, user_enter,
+
+ TP_PROTO(int dummy),
+
+ TP_ARGS(dummy)
+);
+
+/**
+ * user_exit - called when userspace enters the kernel
+ * @dummy: dummy arg to make trace event macro happy
+ *
+ * This event occurs when userspace enters the kernel through
+ * an exception or a syscall.
+ */
+DEFINE_EVENT(context_tracking_user, user_exit,
+
+ TP_PROTO(int dummy),
+
+ TP_ARGS(dummy)
+);
+
+
+#endif /* _TRACE_CONTEXT_TRACKING_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/init/Kconfig b/init/Kconfig
index 247084b..ffbf5d7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -527,13 +527,29 @@ config RCU_USER_QS
config CONTEXT_TRACKING_FORCE
bool "Force context tracking"
depends on CONTEXT_TRACKING
- default CONTEXT_TRACKING
+ default y if !NO_HZ_FULL
help
- Probe on user/kernel boundaries by default in order to
- test the features that rely on it such as userspace RCU extended
- quiescent states.
- This test is there for debugging until we have a real user like the
- full dynticks mode.
+ The major pre-requirement for full dynticks to work is to
+ support the context tracking subsystem. But there are also
+ other dependencies to provide in order to make the full
+ dynticks working.
+
+ This option stands for testing when an arch implements the
+ context tracking backend but doesn't yet fulfill all the
+ requirements to make the full dynticks feature working.
+ Without the full dynticks, there is no way to test the support
+ for context tracking and the subsystems that rely on it: RCU
+ userspace extended quiescent state and tickless cputime
+ accounting. This option copes with the absence of the full
+ dynticks subsystem by forcing the context tracking on all
+ CPUs in the system.
+
+ Say Y only if you're working on the development of an
+ architecture backend for the context tracking.
+
+ Say N otherwise, this option brings an overhead that you
+ don't want in production.
+
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
diff --git a/init/main.c b/init/main.c
index d03d2ec..af310af 100644
--- a/init/main.c
+++ b/init/main.c
@@ -75,6 +75,7 @@
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/sched_clock.h>
+#include <linux/context_tracking.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -545,6 +546,7 @@ asmlinkage void __init start_kernel(void)
idr_init_cache();
rcu_init();
tick_nohz_init();
+ context_tracking_init();
radix_tree_init();
/* init some links before init_ISA_irqs() */
early_irq_init();
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 383f823..247091b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -20,22 +20,33 @@
#include <linux/hardirq.h>
#include <linux/export.h>
-DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
-#ifdef CONFIG_CONTEXT_TRACKING_FORCE
- .active = true,
-#endif
-};
+#define CREATE_TRACE_POINTS
+#include <trace/events/context_tracking.h>
+
+struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(context_tracking_enabled);
+
+DEFINE_PER_CPU(struct context_tracking, context_tracking);
+EXPORT_SYMBOL_GPL(context_tracking);
+
+void context_tracking_cpu_set(int cpu)
+{
+ if (!per_cpu(context_tracking.active, cpu)) {
+ per_cpu(context_tracking.active, cpu) = true;
+ static_key_slow_inc(&context_tracking_enabled);
+ }
+}
/**
- * user_enter - Inform the context tracking that the CPU is going to
- * enter userspace mode.
+ * context_tracking_user_enter - Inform the context tracking that the CPU is going to
+ * enter userspace mode.
*
* This function must be called right before we switch from the kernel
* to userspace, when it's guaranteed the remaining kernel instructions
* to execute won't use any RCU read side critical section because this
* function sets RCU in extended quiescent state.
*/
-void user_enter(void)
+void context_tracking_user_enter(void)
{
unsigned long flags;
@@ -54,17 +65,32 @@ void user_enter(void)
WARN_ON_ONCE(!current->mm);
local_irq_save(flags);
- if (__this_cpu_read(context_tracking.active) &&
- __this_cpu_read(context_tracking.state) != IN_USER) {
+ if ( __this_cpu_read(context_tracking.state) != IN_USER) {
+ if (__this_cpu_read(context_tracking.active)) {
+ trace_user_enter(0);
+ /*
+ * At this stage, only low level arch entry code remains and
+ * then we'll run in userspace. We can assume there won't be
+ * any RCU read-side critical section until the next call to
+ * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * on the tick.
+ */
+ vtime_user_enter(current);
+ rcu_user_enter();
+ }
/*
- * At this stage, only low level arch entry code remains and
- * then we'll run in userspace. We can assume there won't be
- * any RCU read-side critical section until the next call to
- * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
- * on the tick.
+ * Even if context tracking is disabled on this CPU, because it's outside
+ * the full dynticks mask for example, we still have to keep track of the
+ * context transitions and states to prevent inconsistency on those of
+ * other CPUs.
+ * If a task triggers an exception in userspace, sleeps in the exception
+ * handler and then migrates to another CPU, that new CPU must know where
+ * the exception returns by the time we call exception_exit().
+ * This information can only be provided by the previous CPU when it called
+ * exception_enter().
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+ * is false because we know that CPU is not tickless.
*/
- vtime_user_enter(current);
- rcu_user_enter();
__this_cpu_write(context_tracking.state, IN_USER);
}
local_irq_restore(flags);
@@ -87,10 +113,9 @@ void user_enter(void)
*/
void __sched notrace preempt_schedule_context(void)
{
- struct thread_info *ti = current_thread_info();
enum ctx_state prev_ctx;
- if (likely(ti->preempt_count || irqs_disabled()))
+ if (likely(!preemptible()))
return;
/*
@@ -112,8 +137,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
#endif /* CONFIG_PREEMPT */
/**
- * user_exit - Inform the context tracking that the CPU is
- * exiting userspace mode and entering the kernel.
+ * context_tracking_user_exit - Inform the context tracking that the CPU is
+ * exiting userspace mode and entering the kernel.
*
* This function must be called after we entered the kernel from userspace
* before any use of RCU read side critical section. This potentially include
@@ -122,7 +147,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void user_exit(void)
+void context_tracking_user_exit(void)
{
unsigned long flags;
@@ -131,38 +156,22 @@ void user_exit(void)
local_irq_save(flags);
if (__this_cpu_read(context_tracking.state) == IN_USER) {
- /*
- * We are going to run code that may use RCU. Inform
- * RCU core about that (ie: we may need the tick again).
- */
- rcu_user_exit();
- vtime_user_exit(current);
+ if (__this_cpu_read(context_tracking.active)) {
+ /*
+ * We are going to run code that may use RCU. Inform
+ * RCU core about that (ie: we may need the tick again).
+ */
+ rcu_user_exit();
+ vtime_user_exit(current);
+ trace_user_exit(0);
+ }
__this_cpu_write(context_tracking.state, IN_KERNEL);
}
local_irq_restore(flags);
}
-void guest_enter(void)
-{
- if (vtime_accounting_enabled())
- vtime_guest_enter(current);
- else
- __guest_enter();
-}
-EXPORT_SYMBOL_GPL(guest_enter);
-
-void guest_exit(void)
-{
- if (vtime_accounting_enabled())
- vtime_guest_exit(current);
- else
- __guest_exit();
-}
-EXPORT_SYMBOL_GPL(guest_exit);
-
-
/**
- * context_tracking_task_switch - context switch the syscall callbacks
+ * __context_tracking_task_switch - context switch the syscall callbacks
* @prev: the task that is being switched out
* @next: the task that is being switched in
*
@@ -174,11 +183,19 @@ EXPORT_SYMBOL_GPL(guest_exit);
* migrate to some CPU that doesn't do the context tracking. As such the TIF
* flag may not be desired there.
*/
-void context_tracking_task_switch(struct task_struct *prev,
- struct task_struct *next)
+void __context_tracking_task_switch(struct task_struct *prev,
+ struct task_struct *next)
{
- if (__this_cpu_read(context_tracking.active)) {
- clear_tsk_thread_flag(prev, TIF_NOHZ);
- set_tsk_thread_flag(next, TIF_NOHZ);
- }
+ clear_tsk_thread_flag(prev, TIF_NOHZ);
+ set_tsk_thread_flag(next, TIF_NOHZ);
}
+
+#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+void __init context_tracking_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ context_tracking_cpu_set(cpu);
+}
+#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7c32cb..3fb7ace 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2510,13 +2510,11 @@ void __sched schedule_preempt_disabled(void)
*/
asmlinkage void __sched notrace preempt_schedule(void)
{
- struct thread_info *ti = current_thread_info();
-
/*
* If there is a non-zero preempt_count or interrupts are disabled,
* we do not want to preempt the current task. Just return..
*/
- if (likely(ti->preempt_count || irqs_disabled()))
+ if (likely(!preemptible()))
return;
do {
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a7959e0..c1d7493 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
+void vtime_common_task_switch(struct task_struct *prev)
{
- if (!vtime_accounting_enabled())
- return;
-
if (is_idle_task(prev))
vtime_account_idle(prev);
else
@@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev)
* vtime_account().
*/
#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account_irq_enter(struct task_struct *tsk)
+void vtime_common_account_irq_enter(struct task_struct *tsk)
{
- if (!vtime_accounting_enabled())
- return;
-
if (!in_interrupt()) {
/*
* If we interrupted user, context_tracking_in_user()
@@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
}
vtime_account_system(tsk);
}
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
+EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
@@ -559,12 +553,6 @@ static void cputime_adjust(struct task_cputime *curr,
{
cputime_t rtime, stime, utime, total;
- if (vtime_accounting_enabled()) {
- *ut = curr->utime;
- *st = curr->stime;
- return;
- }
-
stime = curr->stime;
total = stime + curr->utime;
@@ -664,23 +652,17 @@ static void __vtime_account_system(struct task_struct *tsk)
void vtime_account_system(struct task_struct *tsk)
{
- if (!vtime_accounting_enabled())
- return;
-
write_seqlock(&tsk->vtime_seqlock);
__vtime_account_system(tsk);
write_sequnlock(&tsk->vtime_seqlock);
}
-void vtime_account_irq_exit(struct task_struct *tsk)
+void vtime_gen_account_irq_exit(struct task_struct *tsk)
{
- if (!vtime_accounting_enabled())
- return;
-
write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER;
- __vtime_account_system(tsk);
write_sequnlock(&tsk->vtime_seqlock);
}
@@ -688,12 +670,8 @@ void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;
- if (!vtime_accounting_enabled())
- return;
-
- delta_cpu = get_vtime_delta(tsk);
-
write_seqlock(&tsk->vtime_seqlock);
+ delta_cpu = get_vtime_delta(tsk);
tsk->vtime_snap_whence = VTIME_SYS;
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
write_sequnlock(&tsk->vtime_seqlock);
@@ -701,22 +679,27 @@ void vtime_account_user(struct task_struct *tsk)
void vtime_user_enter(struct task_struct *tsk)
{
- if (!vtime_accounting_enabled())
- return;
-
write_seqlock(&tsk->vtime_seqlock);
- tsk->vtime_snap_whence = VTIME_USER;
__vtime_account_system(tsk);
+ tsk->vtime_snap_whence = VTIME_USER;
write_sequnlock(&tsk->vtime_seqlock);
}
void vtime_guest_enter(struct task_struct *tsk)
{
+ /*
+ * The flags must be updated under the lock with
+ * the vtime_snap flush and update.
+ * That enforces the right ordering and update sequence
+ * synchronization against the reader (task_gtime())
+ * that can thus safely catch up with a tickless delta.
+ */
write_seqlock(&tsk->vtime_seqlock);
__vtime_account_system(tsk);
current->flags |= PF_VCPU;
write_sequnlock(&tsk->vtime_seqlock);
}
+EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk)
{
@@ -725,6 +708,7 @@ void vtime_guest_exit(struct task_struct *tsk)
current->flags &= ~PF_VCPU;
write_sequnlock(&tsk->vtime_seqlock);
}
+EXPORT_SYMBOL_GPL(vtime_guest_exit);
void vtime_account_idle(struct task_struct *tsk)
{
@@ -733,11 +717,6 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(delta_cpu);
}
-bool vtime_accounting_enabled(void)
-{
- return context_tracking_active();
-}
-
void arch_vtime_task_switch(struct task_struct *prev)
{
write_seqlock(&prev->vtime_seqlock);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 70f27e8..747bbc7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -105,7 +105,6 @@ config NO_HZ_FULL
select RCU_USER_QS
select RCU_NOCB_CPU
select VIRT_CPU_ACCOUNTING_GEN
- select CONTEXT_TRACKING_FORCE
select IRQ_WORK
help
Adaptively try to shutdown the tick whenever possible, even when
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index a326f27..0b479a6 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -121,7 +121,7 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
BUG_ON(bits > 32);
WARN_ON(!irqs_disabled());
read_sched_clock = read;
- sched_clock_mask = (1 << bits) - 1;
+ sched_clock_mask = (1ULL << bits) - 1;
cd.rate = rate;
/* calculate the mult/shift to convert counter ticks to ns. */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e77edc9..adea6fc3 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -23,6 +23,7 @@
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/perf_event.h>
+#include <linux/context_tracking.h>
#include <asm/irq_regs.h>
@@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
}
#ifdef CONFIG_NO_HZ_FULL
-static cpumask_var_t nohz_full_mask;
-bool have_nohz_full_mask;
+cpumask_var_t tick_nohz_full_mask;
+bool tick_nohz_full_running;
static bool can_stop_full_tick(void)
{
@@ -182,7 +183,8 @@ static bool can_stop_full_tick(void)
* Don't allow the user to think they can get
* full NO_HZ with this machine.
*/
- WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock");
+ WARN_ONCE(tick_nohz_full_running,
+ "NO_HZ FULL will not work with unstable sched clock");
return false;
}
#endif
@@ -196,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
* Re-evaluate the need for the tick on the current CPU
* and restart it if necessary.
*/
-void tick_nohz_full_check(void)
+void __tick_nohz_full_check(void)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
@@ -210,7 +212,7 @@ void tick_nohz_full_check(void)
static void nohz_full_kick_work_func(struct irq_work *work)
{
- tick_nohz_full_check();
+ __tick_nohz_full_check();
}
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -229,7 +231,7 @@ void tick_nohz_full_kick(void)
static void nohz_full_kick_ipi(void *info)
{
- tick_nohz_full_check();
+ __tick_nohz_full_check();
}
/*
@@ -238,11 +240,11 @@ static void nohz_full_kick_ipi(void *info)
*/
void tick_nohz_full_kick_all(void)
{
- if (!have_nohz_full_mask)
+ if (!tick_nohz_full_running)
return;
preempt_disable();
- smp_call_function_many(nohz_full_mask,
+ smp_call_function_many(tick_nohz_full_mask,
nohz_full_kick_ipi, NULL, false);
preempt_enable();
}
@@ -252,7 +254,7 @@ void tick_nohz_full_kick_all(void)
* It might need the tick due to per task/process properties:
* perf events, posix cpu timers, ...
*/
-void tick_nohz_task_switch(struct task_struct *tsk)
+void __tick_nohz_task_switch(struct task_struct *tsk)
{
unsigned long flags;
@@ -268,31 +270,23 @@ out:
local_irq_restore(flags);
}
-int tick_nohz_full_cpu(int cpu)
-{
- if (!have_nohz_full_mask)
- return 0;
-
- return cpumask_test_cpu(cpu, nohz_full_mask);
-}
-
/* Parse the boot-time nohz CPU list from the kernel parameters. */
static int __init tick_nohz_full_setup(char *str)
{
int cpu;
- alloc_bootmem_cpumask_var(&nohz_full_mask);
- if (cpulist_parse(str, nohz_full_mask) < 0) {
+ alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
+ if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
return 1;
}
cpu = smp_processor_id();
- if (cpumask_test_cpu(cpu, nohz_full_mask)) {
+ if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
- cpumask_clear_cpu(cpu, nohz_full_mask);
+ cpumask_clear_cpu(cpu, tick_nohz_full_mask);
}
- have_nohz_full_mask = true;
+ tick_nohz_full_running = true;
return 1;
}
@@ -310,7 +304,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
* If we handle the timekeeping duty for full dynticks CPUs,
* we can't safely shutdown that CPU.
*/
- if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
+ if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
return NOTIFY_BAD;
break;
}
@@ -329,14 +323,14 @@ static int tick_nohz_init_all(void)
int err = -1;
#ifdef CONFIG_NO_HZ_FULL_ALL
- if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) {
+ if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
return err;
}
err = 0;
- cpumask_setall(nohz_full_mask);
- cpumask_clear_cpu(smp_processor_id(), nohz_full_mask);
- have_nohz_full_mask = true;
+ cpumask_setall(tick_nohz_full_mask);
+ cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
+ tick_nohz_full_running = true;
#endif
return err;
}
@@ -345,17 +339,18 @@ void __init tick_nohz_init(void)
{
int cpu;
- if (!have_nohz_full_mask) {
+ if (!tick_nohz_full_running) {
if (tick_nohz_init_all() < 0)
return;
}
+ for_each_cpu(cpu, tick_nohz_full_mask)
+ context_tracking_cpu_set(cpu);
+
cpu_notifier(tick_nohz_cpu_down_callback, 0);
- cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
+ cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
}
-#else
-#define have_nohz_full_mask (0)
#endif
/*
@@ -733,7 +728,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return false;
}
- if (have_nohz_full_mask) {
+ if (tick_nohz_full_enabled()) {
/*
* Keep the tick alive to guarantee timekeeping progression
* if there are full dynticks CPUs around
OpenPOWER on IntegriCloud