From 1e02cd40f15190b78fcc6b3f50c952fb4028e9a5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Mar 2016 15:39:24 +0100 Subject: perf/core: Fix the unthrottle logic Its possible to IOC_PERIOD while the event is throttled, this would re-start the event and the next tick would then try to unthrottle it, and find the event wasn't actually stopped anymore. This would tickle a WARN in the x86-pmu code which isn't expecting to start a !stopped event. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: dvyukov@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160310143924.GR6356@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 712570d..d394773 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4210,6 +4210,14 @@ static void __perf_event_period(struct perf_event *event, active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(ctx->pmu); + /* + * We could be throttled; unthrottle now to avoid the tick + * trying to unthrottle while we already re-started the event. + */ + if (event->hw.interrupts == MAX_INTERRUPTS) { + event->hw.interrupts = 0; + perf_log_throttle(event, 1); + } event->pmu->stop(event, PERF_EF_UPDATE); } -- cgit v1.1 From 91a612eea9a316c464cc170ff8492ec09e7d1c69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Mar 2016 15:17:35 +0100 Subject: perf/core: Fix dynamic interrupt throttle There were two problems with the dynamic interrupt throttle mechanism, both triggered by the same action. When you (or perf_fuzzer) write a huge value into /proc/sys/kernel/perf_event_max_sample_rate the computed perf_sample_allowed_ns becomes 0. This effectively disables the whole dynamic throttle. This is fixed by ensuring update_perf_cpu_limits() never sets the value to 0. However, we allow disabling of the dynamic throttle by writing 100 to /proc/sys/kernel/perf_cpu_time_max_percent. This will generate a warning in dmesg. The second problem is that by setting the max_sample_rate to a huge number, the adaptive process can take a few tries, since it halfs the limit each time. Change that to directly compute a new value based on the observed duration. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 87 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d394773..43e35fa 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -376,8 +376,11 @@ static void update_perf_cpu_limits(void) u64 tmp = perf_sample_period_ns; tmp *= sysctl_perf_cpu_time_max_percent; - do_div(tmp, 100); - ACCESS_ONCE(perf_sample_allowed_ns) = tmp; + tmp = div_u64(tmp, 100); + if (!tmp) + tmp = 1; + + WRITE_ONCE(perf_sample_allowed_ns, tmp); } static int perf_rotate_context(struct perf_cpu_context *cpuctx); @@ -409,7 +412,13 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, if (ret || !write) return ret; - update_perf_cpu_limits(); + if (sysctl_perf_cpu_time_max_percent == 100) { + printk(KERN_WARNING + "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); + WRITE_ONCE(perf_sample_allowed_ns, 0); + } else { + update_perf_cpu_limits(); + } return 0; } @@ -423,62 +432,68 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); +static u64 __report_avg; +static u64 __report_allowed; + static void perf_duration_warn(struct irq_work *w) { - u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - u64 avg_local_sample_len; - u64 local_samples_len; - - local_samples_len = __this_cpu_read(running_sample_length); - avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - printk_ratelimited(KERN_WARNING - "perf interrupt took too long (%lld > %lld), lowering " - "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns >> 1, - sysctl_perf_event_sample_rate); + "perf: interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + __report_avg, __report_allowed, + sysctl_perf_event_sample_rate); } static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); void perf_sample_event_took(u64 sample_len_ns) { - u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - u64 avg_local_sample_len; - u64 local_samples_len; + u64 max_len = READ_ONCE(perf_sample_allowed_ns); + u64 running_len; + u64 avg_len; + u32 max; - if (allowed_ns == 0) + if (max_len == 0) return; - /* decay the counter by 1 average sample */ - local_samples_len = __this_cpu_read(running_sample_length); - local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; - local_samples_len += sample_len_ns; - __this_cpu_write(running_sample_length, local_samples_len); + /* Decay the counter by 1 average sample. */ + running_len = __this_cpu_read(running_sample_length); + running_len -= running_len/NR_ACCUMULATED_SAMPLES; + running_len += sample_len_ns; + __this_cpu_write(running_sample_length, running_len); /* - * note: this will be biased artifically low until we have - * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us + * Note: this will be biased artifically low until we have + * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. */ - avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - - if (avg_local_sample_len <= allowed_ns) + avg_len = running_len/NR_ACCUMULATED_SAMPLES; + if (avg_len <= max_len) return; - if (max_samples_per_tick <= 1) - return; + __report_avg = avg_len; + __report_allowed = max_len; - max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); - sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; - perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; + /* + * Compute a throttle threshold 25% below the current duration. + */ + avg_len += avg_len / 4; + max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; + if (avg_len < max) + max /= (u32)avg_len; + else + max = 1; - update_perf_cpu_limits(); + WRITE_ONCE(perf_sample_allowed_ns, avg_len); + WRITE_ONCE(max_samples_per_tick, max); + + sysctl_perf_event_sample_rate = max * HZ; + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; if (!irq_work_queue(&perf_duration_work)) { - early_printk("perf interrupt took too long (%lld > %lld), lowering " + early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns >> 1, + __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } } -- cgit v1.1 From 8184059e93c200757f5c0805dae0f14e880eab5d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Jan 2016 15:17:51 +0100 Subject: perf/core: Fix Undefined behaviour in rb_alloc() Sasha reported: [ 3494.030114] UBSAN: Undefined behaviour in kernel/events/ring_buffer.c:685:22 [ 3494.030647] shift exponent -1 is negative Andrey spotted that this is because: It happens if nr_pages = 0: rb->page_order = ilog2(nr_pages); Fix it by making both assignments conditional on nr_pages; since otherwise they should both be 0 anyway, and will be because of the kzalloc() used to allocate the structure. Reported-by: Sasha Levin Reported-by: Andrey Ryabinin Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20160129141751.GA407@worktop Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 1faad2cf..c61f0cb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -746,8 +746,10 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; - rb->page_order = ilog2(nr_pages); - rb->nr_pages = !!nr_pages; + if (nr_pages) { + rb->nr_pages = 1; + rb->page_order = ilog2(nr_pages); + } ring_buffer_init(rb, watermark, flags); -- cgit v1.1 From 1dcaac1ce07db672e2cb5b4ef9642990689bb2a1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Mar 2016 17:56:05 +0100 Subject: perf/core: Document some hotplug bits Document some of the hotplug notifier usage. Requested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 43e35fa..de24fbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9449,10 +9449,29 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: + /* + * This must be done before the CPU comes alive, because the + * moment we can run tasks we can encounter (software) events. + * + * Specifically, someone can have inherited events on kthreadd + * or a pre-existing worker thread that gets re-bound. + */ perf_event_init_cpu(cpu); break; case CPU_DOWN_PREPARE: + /* + * This must be done before the CPU dies because after that an + * active event might want to IPI the CPU and that'll not work + * so great for dead CPUs. + * + * XXX smp_call_function_single() return -ENXIO without a warn + * so we could possibly deal with this. + * + * This is safe against new events arriving because + * sys_perf_event_open() serializes against hotplug using + * get_online_cpus(). + */ perf_event_exit_cpu(cpu); break; default: -- cgit v1.1