From 3859a271a003aba01e45b85c9d8b355eb7bf25f9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 28 Oct 2016 01:22:25 -0700 Subject: randstruct: Mark various structs for randomization This marks many critical kernel structures for randomization. These are structures that have been targeted in the past in security exploits, or contain functions pointers, pointers to function pointer tables, lists, workqueues, ref-counters, credentials, permissions, or are otherwise sensitive. This initial list was extracted from Brad Spengler/PaX Team's code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Left out of this list is task_struct, which requires special handling and will be covered in a subsequent patch. Signed-off-by: Kees Cook --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 357348a..5616511 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -212,7 +212,7 @@ struct futex_pi_state { atomic_t refcount; union futex_key key; -}; +} __randomize_layout; /** * struct futex_q - The hashed futex queue entry, one per waiting task @@ -246,7 +246,7 @@ struct futex_q { struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; -}; +} __randomize_layout; static const struct futex_q futex_q_init = { /* list gets initialized in queue_me()*/ -- cgit v1.1 From 610467270fb368584b74567edd21c8cc5104490f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Jul 2017 07:17:02 -0400 Subject: cgroup: don't call migration methods if there are no tasks to migrate Subsystem migration methods shouldn't be called for empty migrations. cgroup_migrate_execute() implements this guarantee by bailing early if there are no source css_sets. This used to be correct before a79a908fd2b0 ("cgroup: introduce cgroup namespaces"), but no longer since the commit because css_sets can stay pinned without tasks in them. This caused cgroup_migrate_execute() call into cpuset migration methods with an empty cgroup_taskset. cpuset migration methods correctly assume that cgroup_taskset_first() never returns NULL; however, due to the bug, it can, leading to the following oops. Unable to handle kernel paging request for data at address 0x00000960 Faulting instruction address: 0xc0000000001d6868 Oops: Kernel access of bad area, sig: 11 [#1] ... CPU: 14 PID: 16947 Comm: kworker/14:0 Tainted: G W 4.12.0-rc4-next-20170609 #2 Workqueue: events cpuset_hotplug_workfn task: c00000000ca60580 task.stack: c00000000c728000 NIP: c0000000001d6868 LR: c0000000001d6858 CTR: c0000000001d6810 REGS: c00000000c72b720 TRAP: 0300 Tainted: GW (4.12.0-rc4-next-20170609) MSR: 8000000000009033 CR: 44722422 XER: 20000000 CFAR: c000000000008710 DAR: 0000000000000960 DSISR: 40000000 SOFTE: 1 GPR00: c0000000001d6858 c00000000c72b9a0 c000000001536e00 0000000000000000 GPR04: c00000000c72b9c0 0000000000000000 c00000000c72bad0 c000000766367678 GPR08: c000000766366d10 c00000000c72b958 c000000001736e00 0000000000000000 GPR12: c0000000001d6810 c00000000e749300 c000000000123ef8 c000000775af4180 GPR16: 0000000000000000 0000000000000000 c00000075480e9c0 c00000075480e9e0 GPR20: c00000075480e8c0 0000000000000001 0000000000000000 c00000000c72ba20 GPR24: c00000000c72baa0 c00000000c72bac0 c000000001407248 c00000000c72ba20 GPR28: c00000000141fc80 c00000000c72bac0 c00000000c6bc790 0000000000000000 NIP [c0000000001d6868] cpuset_can_attach+0x58/0x1b0 LR [c0000000001d6858] cpuset_can_attach+0x48/0x1b0 Call Trace: [c00000000c72b9a0] [c0000000001d6858] cpuset_can_attach+0x48/0x1b0 (unreliable) [c00000000c72ba00] [c0000000001cbe80] cgroup_migrate_execute+0xb0/0x450 [c00000000c72ba80] [c0000000001d3754] cgroup_transfer_tasks+0x1c4/0x360 [c00000000c72bba0] [c0000000001d923c] cpuset_hotplug_workfn+0x86c/0xa20 [c00000000c72bca0] [c00000000011aa44] process_one_work+0x1e4/0x580 [c00000000c72bd30] [c00000000011ae78] worker_thread+0x98/0x5c0 [c00000000c72bdc0] [c000000000124058] kthread+0x168/0x1b0 [c00000000c72be30] [c00000000000b2e8] ret_from_kernel_thread+0x5c/0x74 Instruction dump: f821ffa1 7c7d1b78 60000000 60000000 38810020 7fa3eb78 3f42ffed 4bff4c25 60000000 3b5a0448 3d420020 eb610020 7f43d378 e9290000 f92af200 ---[ end trace dcaaf98fb36d9e64 ]--- This patch fixes the bug by adding an explicit nr_tasks counter to cgroup_taskset and skipping calling the migration methods if the counter is zero. While at it, remove the now spurious check on no source css_sets. Signed-off-by: Tejun Heo Reported-and-tested-by: Abdul Haleem Cc: Roman Gushchin Cc: stable@vger.kernel.org # v4.6+ Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") Link: http://lkml.kernel.org/r/1497266622.15415.39.camel@abdul.in.ibm.com --- kernel/cgroup/cgroup-internal.h | 3 +++ kernel/cgroup/cgroup.c | 58 ++++++++++++++++++++++------------------- 2 files changed, 34 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 793565c..8b4c3c2 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -33,6 +33,9 @@ struct cgroup_taskset { struct list_head src_csets; struct list_head dst_csets; + /* the number of tasks in the set */ + int nr_tasks; + /* the subsys currently being processed */ int ssid; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 620794a..cc53111 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2006,6 +2006,8 @@ static void cgroup_migrate_add_task(struct task_struct *task, if (!cset->mg_src_cgrp) return; + mgctx->tset.nr_tasks++; + list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, @@ -2094,21 +2096,19 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) struct css_set *cset, *tmp_cset; int ssid, failed_ssid, ret; - /* methods shouldn't be called if no task is actually migrating */ - if (list_empty(&tset->src_csets)) - return 0; - /* check that we can legitimately attach to the cgroup */ - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ss->can_attach) { - tset->ssid = ssid; - ret = ss->can_attach(tset); - if (ret) { - failed_ssid = ssid; - goto out_cancel_attach; + if (tset->nr_tasks) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { + if (ss->can_attach) { + tset->ssid = ssid; + ret = ss->can_attach(tset); + if (ret) { + failed_ssid = ssid; + goto out_cancel_attach; + } } - } - } while_each_subsys_mask(); + } while_each_subsys_mask(); + } /* * Now that we're guaranteed success, proceed to move all tasks to @@ -2137,25 +2137,29 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) */ tset->csets = &tset->dst_csets; - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ss->attach) { - tset->ssid = ssid; - ss->attach(tset); - } - } while_each_subsys_mask(); + if (tset->nr_tasks) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { + if (ss->attach) { + tset->ssid = ssid; + ss->attach(tset); + } + } while_each_subsys_mask(); + } ret = 0; goto out_release_tset; out_cancel_attach: - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ssid == failed_ssid) - break; - if (ss->cancel_attach) { - tset->ssid = ssid; - ss->cancel_attach(tset); - } - } while_each_subsys_mask(); + if (tset->nr_tasks) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { + if (ssid == failed_ssid) + break; + if (ss->cancel_attach) { + tset->ssid = ssid; + ss->cancel_attach(tset); + } + } while_each_subsys_mask(); + } out_release_tset: spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); -- cgit v1.1 From 6a8a75f3235724c5941a33e287b2f98966ad14c5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Jul 2017 10:56:54 +0200 Subject: Revert "perf/core: Drop kernel samples even though :u is specified" This reverts commit cc1582c231ea041fbc68861dfaf957eaf902b829. This commit introduced a regression that broke rr-project, which uses sampling events to receive a signal on overflow (but does not care about the contents of the sample). These signals are critical to the correct operation of rr. There's been some back and forth about how to fix it - but to not keep applications in limbo queue up a revert. Reported-by: Kyle Huey Acked-by: Kyle Huey Acked-by: Peter Zijlstra Cc: Jin Yao Cc: Vince Weaver Cc: Linus Torvalds Cc: Will Deacon Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Stephane Eranian Cc: Namhyung Kim Cc: Jiri Olsa Cc: Link: http://lkml.kernel.org/r/20170628105600.GC5981@leverpostej Signed-off-by: Ingo Molnar --- kernel/events/core.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4d2c32f..9747e42 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7308,21 +7308,6 @@ int perf_event_account_interrupt(struct perf_event *event) return __perf_event_account_interrupt(event, 1); } -static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) -{ - /* - * Due to interrupt latency (AKA "skid"), we may enter the - * kernel before taking an overflow, even if the PMU is only - * counting user events. - * To avoid leaking information to userspace, we must always - * reject kernel samples when exclude_kernel is set. - */ - if (event->attr.exclude_kernel && !user_mode(regs)) - return false; - - return true; -} - /* * Generic event overflow handling, sampling. */ @@ -7344,12 +7329,6 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); /* - * For security, drop the skid kernel samples if necessary. - */ - if (!sample_is_allowed(event, regs)) - return ret; - - /* * XXX event_limit might not quite work as expected on inherited * events */ -- cgit v1.1 From 69f0d429c413fe96db2c187475cebcc6e3a8c7f5 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 13 Jul 2017 14:18:24 +0800 Subject: locking/rtmutex: Remove unnecessary priority adjustment We don't need to adjust priority before adding a new pi_waiter, the priority only needs to be updated after pi_waiter change or task priority change. Steven Rostedt pointed out: "Interesting, I did some git mining and this was added with the original entry of the rtmutex.c (23f78d4a03c5). Looking at even that version, I don't see the purpose of adjusting the task prio here. It is done before anything changes in the task." Signed-off-by: Alex Shi Reviewed-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1499926704-28841-1-git-send-email-alex.shi@linaro.org [ Enhance the changelog. ] Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7806989..649dc9d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -963,7 +963,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, return -EDEADLK; raw_spin_lock(&task->pi_lock); - rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; -- cgit v1.1 From 0e4097c3354e2f5a5ad8affd9dc7f7f7d00bb6b9 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 9 Jul 2017 00:40:28 -0700 Subject: sched/cputime: Don't use smp_processor_id() in preemptible context Recent kernels trigger this warning: BUG: using smp_processor_id() in preemptible [00000000] code: 99-trinity/181 caller is debug_smp_processor_id+0x17/0x19 CPU: 0 PID: 181 Comm: 99-trinity Not tainted 4.12.0-01059-g2a42eb9 #1 Call Trace: dump_stack+0x82/0xb8 check_preemption_disabled() debug_smp_processor_id() vtime_delta() task_cputime() thread_group_cputime() thread_group_cputime_adjusted() wait_consider_task() do_wait() SYSC_wait4() do_syscall_64() entry_SYSCALL64_slow_path() As Frederic pointed out: | Although those sched_clock_cpu() things seem to only matter when the | sched_clock() is unstable. And that stability is a condition for nohz_full | to work anyway. So probably sched_clock() alone would be enough. This patch fixes it by replacing sched_clock_cpu() with sched_clock() to avoid calling smp_processor_id() in a preemptible context. Reported-by: Xiaolong Ye Signed-off-by: Wanpeng Li Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1499586028-7402-1-git-send-email-wanpeng.li@hotmail.com [ Prettified the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 6e3ea4a..14d2dbf 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -683,7 +683,7 @@ static u64 vtime_delta(struct vtime *vtime) { unsigned long long clock; - clock = sched_clock_cpu(smp_processor_id()); + clock = sched_clock(); if (clock < vtime->starttime) return 0; @@ -814,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = sched_clock_cpu(smp_processor_id()); + vtime->starttime = sched_clock(); write_seqcount_end(&vtime->seqcount); } @@ -826,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = sched_clock_cpu(cpu); + vtime->starttime = sched_clock(); write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } -- cgit v1.1 From 193be41e33168a3a06eb9d356d9e39c69de161d2 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 12 Jul 2017 19:24:29 -0700 Subject: sched/deadline: Fix confusing comments about selection of top pi-waiter This comment in the code is incomplete, and I believe it begs a definition of dl_boosted to make sense of the condition that follows. Rewrite the comment and also rearrange the condition that follows to reflect the first condition "we have a top pi-waiter which is a SCHED_DEADLINE task" in that order. Also fix a typo that follows. Signed-off-by: Joel Fernandes Reviewed-by: Daniel Bristot de Oliveira Acked-by: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170713022429.10307-1-joelaf@google.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a84299f..755bd3f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1392,17 +1392,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) struct sched_dl_entity *pi_se = &p->dl; /* - * Use the scheduling parameters of the top pi-waiter - * task if we have one and its (absolute) deadline is - * smaller than our one... OTW we keep our runtime and - * deadline. + * Use the scheduling parameters of the top pi-waiter task if: + * - we have a top pi-waiter which is a SCHED_DEADLINE task AND + * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is + * smaller than our deadline OR we are a !SCHED_DEADLINE task getting + * boosted due to a SCHED_DEADLINE pi-waiter). + * Otherwise we keep our runtime and deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { + if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { pi_se = &pi_task->dl; } else if (!dl_prio(p->normal_prio)) { /* * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceedes its + * that is going to be deboosted, but exceeds its * runtime while doing so. No point in replenishing * it, as it's going to return back to its original * scheduling class after this. -- cgit v1.1 From a696712c3dd54eb58d2c5a807b4aaa27782d80d6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 17 Jul 2017 19:47:02 +0200 Subject: genirq/PM: Properly pretend disabled state when force resuming interrupts Interrupts with the IRQF_FORCE_RESUME flag set have also the IRQF_NO_SUSPEND flag set. They are not disabled in the suspend path, but must be forcefully resumed. That's used by XEN to keep IPIs enabled beyond the suspension of device irqs. Force resume works by pretending that the interrupt was disabled and then calling __irq_enable(). Incrementing the disabled depth counter was enough to do that, but with the recent changes which use state flags to avoid unnecessary hardware access, this is not longer sufficient. If the state flags are not set, then the hardware callbacks are not invoked and the interrupt line stays disabled in "hardware". Set the disabled and masked state when pretending that an interrupt got disabled by suspend. Fixes: bf22ff45bed6 ("genirq: Avoid unnecessary low level irq function calls") Suggested-by: Thomas Gleixner Signed-off-by: Juergen Gross Signed-off-by: Thomas Gleixner Cc: xen-devel@lists.xenproject.org Cc: boris.ostrovsky@oracle.com Link: http://lkml.kernel.org/r/20170717174703.4603-2-jgross@suse.com --- kernel/irq/chip.c | 10 ---------- kernel/irq/internals.h | 10 ++++++++++ kernel/irq/pm.c | 2 ++ 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d171bc5..a3cc37c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -170,21 +170,11 @@ static void irq_state_clr_disabled(struct irq_desc *desc) irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); } -static void irq_state_set_disabled(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); -} - static void irq_state_clr_masked(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } -static void irq_state_set_masked(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); -} - static void irq_state_clr_started(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index dbfba99..a2c4805 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -227,6 +227,16 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } +static inline void irq_state_set_disabled(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); +} + +static inline void irq_state_set_masked(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); +} + #undef __irqd_to_state static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cea1de0..6bd9b58 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -149,6 +149,8 @@ static void resume_irq(struct irq_desc *desc) /* Pretend that it got disabled ! */ desc->depth++; + irq_state_set_disabled(desc); + irq_state_set_masked(desc); resume: desc->istate &= ~IRQS_SUSPENDED; __enable_irq(desc); -- cgit v1.1 From 7af608e4f9530372aec6e940552bf76595f2e265 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jul 2017 17:57:46 -0400 Subject: cgroup: create dfl_root files on subsys registration On subsystem registration, css_populate_dir() is not called on the new root css, so the interface files for the subsystem on cgrp_dfl_root aren't created on registration. This is a residue from the days when cgrp_dfl_root was used only as the parking spot for unused subsystems, which no longer is true as it's used as the root for cgroup2. This is often fine as later operations tend to create them as a part of mount (cgroup1) or subtree_control operations (cgroup2); however, it's not difficult to mount cgroup2 with the controller interface files missing as Waiman found out. Fix it by invoking css_populate_dir() on the root css on subsys registration. Signed-off-by: Tejun Heo Reported-and-tested-by: Waiman Long Cc: stable@vger.kernel.org # v4.5+ Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cc53111..7449759 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4673,6 +4673,10 @@ int __init cgroup_init(void) if (ss->bind) ss->bind(init_css_set.subsys[ssid]); + + mutex_lock(&cgroup_mutex); + css_populate_dir(init_css_set.subsys[ssid]); + mutex_unlock(&cgroup_mutex); } /* init_css_set.subsys[] has been updated, re-hash */ -- cgit v1.1 From 848618857d2535176037bdc085f8d012d907071f Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 12 Jul 2017 19:14:16 -0700 Subject: tracing/ring_buffer: Try harder to allocate ftrace can fail to allocate per-CPU ring buffer on systems with a large number of CPUs coupled while large amounts of cache happening in the page cache. Currently the ring buffer allocation doesn't retry in the VM implementation even if direct-reclaim made some progress but still wasn't able to find a free page. On retrying I see that the allocations almost always succeed. The retry doesn't happen because __GFP_NORETRY is used in the tracer to prevent the case where we might OOM, however if we drop __GFP_NORETRY, we risk destabilizing the system if OOM killer is triggered. To prevent this situation, use the __GFP_RETRY_MAYFAIL flag introduced recently [1]. Tested the following still succeeds without destabilizing a system with 1GB memory. echo 300000 > /sys/kernel/debug/tracing/buffer_size_kb [1] https://marc.info/?l=linux-mm&m=149820805124906&w=2 Link: http://lkml.kernel.org/r/20170713021416.8897-1-joelaf@google.com Cc: Tim Murray Cc: Ingo Molnar Cc: Andrew Morton Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4ae268e..529cc50 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1136,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) for (i = 0; i < nr_pages; i++) { struct page *page; /* - * __GFP_NORETRY flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is - * not destabilized. + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL | __GFP_NORETRY, + GFP_KERNEL | __GFP_RETRY_MAYFAIL, cpu_to_node(cpu)); if (!bpage) goto free_pages; @@ -1149,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) list_add(&bpage->list, pages); page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); + GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0); if (!page) goto free_pages; bpage->page = page_address(page); -- cgit v1.1 From b0659ae5e30074ede1dc08f2c6d64f0c11d64e0f Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 18 Jul 2017 14:37:24 +0800 Subject: audit: fix memleak in auditd_send_unicast_skb. Found this issue by kmemleak report, auditd_send_unicast_skb did not free skb if rcu_dereference(auditd_conn) returns null. unreferenced object 0xffff88082568ce00 (size 256): comm "auditd", pid 1119, jiffies 4294708499 backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_node+0xcc/0x210 [] __alloc_skb+0x5d/0x290 [] audit_make_reply+0x54/0xd0 [] audit_receive_msg+0x967/0xd70 ---------------- (gdb) list *audit_receive_msg+0x967 0xffffffff8113dff7 is in audit_receive_msg (kernel/audit.c:1133). 1132 skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr)); --------------- [] audit_receive+0x52/0xa0 [] netlink_unicast+0x181/0x240 [] netlink_sendmsg+0x2c2/0x3b0 [] sock_sendmsg+0x38/0x50 [] SYSC_sendto+0x102/0x190 [] SyS_sendto+0xe/0x10 [] entry_SYSCALL_64_fastpath+0x1a/0xa5 [] 0xffffffffffffffff Signed-off-by: Shu Wang Signed-off-by: Paul Moore --- kernel/audit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7cad702..07def5e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -641,6 +641,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); + kfree_skb(skb); rc = -ECONNREFUSED; goto err; } -- cgit v1.1 From 5c0338c68706be53b3dc472e4308961c36e4ece1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jul 2017 18:41:52 -0400 Subject: workqueue: restore WQ_UNBOUND/max_active==1 to be ordered The combination of WQ_UNBOUND and max_active == 1 used to imply ordered execution. After NUMA affinity 4c16bd327c74 ("workqueue: implement NUMA affinity for unbound workqueues"), this is no longer true due to per-node worker pools. While the right way to create an ordered workqueue is alloc_ordered_workqueue(), the documentation has been misleading for a long time and people do use WQ_UNBOUND and max_active == 1 for ordered workqueues which can lead to subtle bugs which are very difficult to trigger. It's unlikely that we'd see noticeable performance impact by enforcing ordering on WQ_UNBOUND / max_active == 1 workqueues. Let's automatically set __WQ_ORDERED for those workqueues. Signed-off-by: Tejun Heo Reported-by: Christoph Hellwig Reported-by: Alexei Potashnik Fixes: 4c16bd327c74 ("workqueue: implement NUMA affinity for unbound workqueues") Cc: stable@vger.kernel.org # v3.10+ --- kernel/workqueue.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a86688f..abe4a49 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3929,6 +3929,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct workqueue_struct *wq; struct pool_workqueue *pwq; + /* + * Unbound && max_active == 1 used to imply ordered, which is no + * longer the case on NUMA machines due to per-node pools. While + * alloc_ordered_workqueue() is the right way to create an ordered + * workqueue, keep the previous behavior to avoid subtle breakages + * on NUMA. + */ + if ((flags & WQ_UNBOUND) && max_active == 1) + flags |= __WQ_ORDERED; + /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; -- cgit v1.1 From 3bda69c1c3993a2bddbae01397d12bfef6054011 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 18 Jul 2017 14:08:34 +0300 Subject: perf/core: Fix scheduling regression of pinned groups Vince Weaver reported: > I was tracking down some regressions in my perf_event_test testsuite. > Some of the tests broke in the 4.11-rc1 timeframe. > > I've bisected one of them, this report is about > tests/overflow/simul_oneshot_group_overflow > This test creates an event group containing two sampling events, set > to overflow to a signal handler (which disables and then refreshes the > event). > > On a good kernel you get the following: > Event perf::instructions with period 1000000 > Event perf::instructions with period 2000000 > fd 3 overflows: 946 (perf::instructions/1000000) > fd 4 overflows: 473 (perf::instructions/2000000) > Ending counts: > Count 0: 946379875 > Count 1: 946365218 > > With the broken kernels you get: > Event perf::instructions with period 1000000 > Event perf::instructions with period 2000000 > fd 3 overflows: 938 (perf::instructions/1000000) > fd 4 overflows: 318 (perf::instructions/2000000) > Ending counts: > Count 0: 946373080 > Count 1: 653373058 The root cause of the bug is that the following commit: 487f05e18a ("perf/core: Optimize event rescheduling on active contexts") erronously assumed that event's 'pinned' setting determines whether the event belongs to a pinned group or not, but in fact, it's the group leader's pinned state that matters. This was discovered by Vince in the test case described above, where two instruction counters are grouped, the group leader is pinned, but the other event is not; in the regressed case the counters were off by 33% (the difference between events' periods), but should be the same within the error margin. Fix the problem by looking at the group leader's pinning. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 487f05e18a ("perf/core: Optimize event rescheduling on active contexts") Link: http://lkml.kernel.org/r/87lgnmvw7h.fsf@ashishki-desk.ger.corp.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9747e42..c9cdbd3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1452,6 +1452,13 @@ static enum event_type_t get_event_type(struct perf_event *event) lockdep_assert_held(&ctx->lock); + /* + * It's 'group type', really, because if our group leader is + * pinned, so are we. + */ + if (event->group_leader != event) + event = event->group_leader; + event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; if (!ctx->task) event_type |= EVENT_CPU; -- cgit v1.1 From db9108e054700c96322b0f0028546aa4e643cf0b Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Thu, 20 Jul 2017 18:36:09 +0800 Subject: tracing: Fix kmemleak in instance_rmdir Hit the kmemleak when executing instance_rmdir, it forgot releasing mem of tracing_cpumask. With this fix, the warn does not appear any more. unreferenced object 0xffff93a8dfaa7c18 (size 8): comm "mkdir", pid 1436, jiffies 4294763622 (age 9134.308s) hex dump (first 8 bytes): ff ff ff ff ff ff ff ff ........ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc_node+0xf1/0x280 [] alloc_cpumask_var_node+0x23/0x30 [] alloc_cpumask_var+0xe/0x10 [] instance_mkdir+0x90/0x240 [] tracefs_syscall_mkdir+0x40/0x70 [] vfs_mkdir+0x109/0x1b0 [] SyS_mkdir+0xd0/0x100 [] do_syscall_64+0x67/0x150 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff Link: http://lkml.kernel.org/r/1500546969-12594-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: ccfe9e42e451 ("tracing: Make tracing_cpumask available for all instances") Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2d0ffcc..42b9355 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7774,6 +7774,7 @@ static int instance_rmdir(const char *name) } kfree(tr->topts); + free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); -- cgit v1.1 From f86f418059b94aa01f9342611a272ca60c583e89 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Wed, 7 Jun 2017 16:12:51 +0800 Subject: trace: fix the errors caused by incompatible type of RCU variables The variables which are processed by RCU functions should be annotated as RCU, otherwise sparse will report the errors like below: "error: incompatible types in comparison expression (different address spaces)" Link: http://lkml.kernel.org/r/1496823171-7758-1-git-send-email-zhang.chunyan@linaro.org Signed-off-by: Chunyan Zhang [ Updated to not be 100% 80 column strict ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 41 +++++++++++++++++++++++++++-------------- kernel/trace/trace.h | 6 +++--- 2 files changed, 30 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 53f6b64..02004ae9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; @@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void) mutex_lock(&ftrace_lock); - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) + for (ops = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); + ops != &ftrace_list_end; + ops = rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock))) cnt++; mutex_unlock(&ftrace_lock); @@ -275,10 +278,11 @@ static void update_ftrace_function(void) * If there's only one ftrace_ops registered, the ftrace_ops_list * will point to the ops we want. */ - set_function_trace_op = ftrace_ops_list; + set_function_trace_op = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); /* If there's no ftrace_ops registered, just call the stub function */ - if (ftrace_ops_list == &ftrace_list_end) { + if (set_function_trace_op == &ftrace_list_end) { func = ftrace_stub; /* @@ -286,7 +290,8 @@ static void update_ftrace_function(void) * recursion safe and not dynamic and the arch supports passing ops, * then have the mcount trampoline call the function directly. */ - } else if (ftrace_ops_list->next == &ftrace_list_end) { + } else if (rcu_dereference_protected(ftrace_ops_list->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { func = ftrace_ops_get_list_func(ftrace_ops_list); } else { @@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void) return ftrace_trace_function == ftrace_ops_list_func; } -static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static void add_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { - ops->next = *list; + rcu_assign_pointer(ops->next, *list); + /* * We are entering ops into the list but another * CPU might be walking that list. We need to make sure @@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) rcu_assign_pointer(*list, ops); } -static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static int remove_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { struct ftrace_ops **p; @@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) * If we are removing the last function, then simply point * to the ftrace_stub. */ - if (*list == ops && ops->next == &ftrace_list_end) { + if (rcu_dereference_protected(*list, + lockdep_is_held(&ftrace_lock)) == ops && + rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { *list = &ftrace_list_end; return 0; } @@ -1569,8 +1580,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); - hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); + rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash); + rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash); if (hash_contains_ip(ip, &hash)) ret = 1; @@ -2840,7 +2851,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * If there's no more ops registered with ftrace, run a * sanity check to make sure all rec flags are cleared. */ - if (ftrace_ops_list == &ftrace_list_end) { + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -6453,7 +6465,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ftrace_enabled) { /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) != &ftrace_list_end) update_ftrace_function(); ftrace_startup_sysctl(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6ade1c5..490ba22 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1210,9 +1210,9 @@ struct ftrace_event_field { struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ - struct filter_pred *preds; - struct filter_pred *root; - char *filter_string; + struct filter_pred __rcu *preds; + struct filter_pred __rcu *root; + char *filter_string; }; struct event_subsystem { -- cgit v1.1 From 4cabc5b186b5427b9ee5a7495172542af105f02b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 Jul 2017 00:00:21 +0200 Subject: bpf: fix mixed signed/unsigned derived min/max value bounds Edward reported that there's an issue in min/max value bounds tracking when signed and unsigned compares both provide hints on limits when having unknown variables. E.g. a program such as the following should have been rejected: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff8a94cda93400 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+7 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = -1 10: (2d) if r1 > r2 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 11: (65) if r1 s> 0x1 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0,max_value=1 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 12: (0f) r0 += r1 13: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=0,max_value=1 R1=inv,min_value=0,max_value=1 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 14: (b7) r0 = 0 15: (95) exit What happens is that in the first part ... 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = -1 10: (2d) if r1 > r2 goto pc+3 ... r1 carries an unsigned value, and is compared as unsigned against a register carrying an immediate. Verifier deduces in reg_set_min_max() that since the compare is unsigned and operation is greater than (>), that in the fall-through/false case, r1's minimum bound must be 0 and maximum bound must be r2. Latter is larger than the bound and thus max value is reset back to being 'invalid' aka BPF_REGISTER_MAX_RANGE. Thus, r1 state is now 'R1=inv,min_value=0'. The subsequent test ... 11: (65) if r1 s> 0x1 goto pc+2 ... is a signed compare of r1 with immediate value 1. Here, verifier deduces in reg_set_min_max() that since the compare is signed this time and operation is greater than (>), that in the fall-through/false case, we can deduce that r1's maximum bound must be 1, meaning with prior test, we result in r1 having the following state: R1=inv,min_value=0,max_value=1. Given that the actual value this holds is -8, the bounds are wrongly deduced. When this is being added to r0 which holds the map_value(_adj) type, then subsequent store access in above case will go through check_mem_access() which invokes check_map_access_adj(), that will then probe whether the map memory is in bounds based on the min_value and max_value as well as access size since the actual unknown value is min_value <= x <= max_value; commit fce366a9dd0d ("bpf, verifier: fix alu ops against map_value{, _adj} register types") provides some more explanation on the semantics. It's worth to note in this context that in the current code, min_value and max_value tracking are used for two things, i) dynamic map value access via check_map_access_adj() and since commit 06c1c049721a ("bpf: allow helpers access to variable memory") ii) also enforced at check_helper_mem_access() when passing a memory address (pointer to packet, map value, stack) and length pair to a helper and the length in this case is an unknown value defining an access range through min_value/max_value in that case. The min_value/max_value tracking is /not/ used in the direct packet access case to track ranges. However, the issue also affects case ii), for example, the following crafted program based on the same principle must be rejected as well: 0: (b7) r2 = 0 1: (bf) r3 = r10 2: (07) r3 += -512 3: (7a) *(u64 *)(r10 -16) = -8 4: (79) r4 = *(u64 *)(r10 -16) 5: (b7) r6 = -1 6: (2d) if r4 > r6 goto pc+5 R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512 R4=inv,min_value=0 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 7: (65) if r4 s> 0x1 goto pc+4 R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512 R4=inv,min_value=0,max_value=1 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 8: (07) r4 += 1 9: (b7) r5 = 0 10: (6a) *(u16 *)(r10 -512) = 0 11: (85) call bpf_skb_load_bytes#26 12: (b7) r0 = 0 13: (95) exit Meaning, while we initialize the max_value stack slot that the verifier thinks we access in the [1,2] range, in reality we pass -7 as length which is interpreted as u32 in the helper. Thus, this issue is relevant also for the case of helper ranges. Resetting both bounds in check_reg_overflow() in case only one of them exceeds limits is also not enough as similar test can be created that uses values which are within range, thus also here learned min value in r1 is incorrect when mixed with later signed test to create a range: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff880ad081fa00 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+7 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = 2 10: (3d) if r2 >= r1 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 11: (65) if r1 s> 0x4 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 12: (0f) r0 += r1 13: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=3,max_value=4 R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 14: (b7) r0 = 0 15: (95) exit This leaves us with two options for fixing this: i) to invalidate all prior learned information once we switch signed context, ii) to track min/max signed and unsigned boundaries separately as done in [0]. (Given latter introduces major changes throughout the whole verifier, it's rather net-next material, thus this patch follows option i), meaning we can derive bounds either from only signed tests or only unsigned tests.) There is still the case of adjust_reg_min_max_vals(), where we adjust bounds on ALU operations, meaning programs like the following where boundaries on the reg get mixed in context later on when bounds are merged on the dst reg must get rejected, too: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff89b2bf87ce00 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+6 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = 2 10: (3d) if r2 >= r1 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 11: (b7) r7 = 1 12: (65) if r7 s> 0x0 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,max_value=0 R10=fp 13: (b7) r0 = 0 14: (95) exit from 12 to 15: R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,min_value=1 R10=fp 15: (0f) r7 += r1 16: (65) if r7 s> 0x4 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp 17: (0f) r0 += r7 18: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=4,max_value=4 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp 19: (b7) r0 = 0 20: (95) exit Meaning, in adjust_reg_min_max_vals() we must also reset range values on the dst when src/dst registers have mixed signed/ unsigned derived min/max value bounds with one unbounded value as otherwise they can be added together deducing false boundaries. Once both boundaries are established from either ALU ops or compare operations w/o mixing signed/unsigned insns, then they can safely be added to other regs also having both boundaries established. Adding regs with one unbounded side to a map value where the bounded side has been learned w/o mixing ops is possible, but the resulting map value won't recover from that, meaning such op is considered invalid on the time of actual access. Invalid bounds are set on the dst reg in case i) src reg, or ii) in case dst reg already had them. The only way to recover would be to perform i) ALU ops but only 'add' is allowed on map value types or ii) comparisons, but these are disallowed on pointers in case they span a range. This is fine as only BPF_JEQ and BPF_JNE may be performed on PTR_TO_MAP_VALUE_OR_NULL registers which potentially turn them into PTR_TO_MAP_VALUE type depending on the branch, so only here min/max value cannot be invalidated for them. In terms of state pruning, value_from_signed is considered as well in states_equal() when dealing with adjusted map values. With regards to breaking existing programs, there is a small risk, but use-cases are rather quite narrow where this could occur and mixing compares probably unlikely. Joint work with Josef and Edward. [0] https://lists.iovisor.org/pipermail/iovisor-dev/2017-June/000822.html Fixes: 484611357c19 ("bpf: allow access into map value arrays") Reported-by: Edward Cree Signed-off-by: Daniel Borkmann Signed-off-by: Edward Cree Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 108 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 94 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a86723..af9e84a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -504,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; regs[regno].max_value = BPF_REGISTER_MAX_RANGE; + regs[regno].value_from_signed = false; regs[regno].min_align = 0; } @@ -777,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } -static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +static bool __is_pointer_value(bool allow_ptr_leaks, + const struct bpf_reg_state *reg) { - if (env->allow_ptr_leaks) + if (allow_ptr_leaks) return false; - switch (env->cur_state.regs[regno].type) { + switch (reg->type) { case UNKNOWN_VALUE: case CONST_IMM: return false; @@ -791,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) } } +static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +{ + return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); +} + static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, int off, int size, bool strict) { @@ -1832,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_align = dst_reg->min_align; /* We don't know anything about what was done to this register, mark it - * as unknown. + * as unknown. Also, if both derived bounds came from signed/unsigned + * mixed compares and one side is unbounded, we cannot really do anything + * with them as boundaries cannot be trusted. Thus, arithmetic of two + * regs of such kind will get invalidated bounds on the dst side. */ - if (min_val == BPF_REGISTER_MIN_RANGE && - max_val == BPF_REGISTER_MAX_RANGE) { + if ((min_val == BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (BPF_SRC(insn->code) == BPF_X && + ((min_val != BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (min_val == BPF_REGISTER_MIN_RANGE && + max_val != BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value != BPF_REGISTER_MIN_RANGE && + dst_reg->max_value == BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value == BPF_REGISTER_MIN_RANGE && + dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) && + regs[insn->dst_reg].value_from_signed != + regs[insn->src_reg].value_from_signed)) { reset_reg_range_values(regs, insn->dst_reg); return; } @@ -2023,6 +2044,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; regs[insn->dst_reg].min_align = calc_align(insn->imm); + regs[insn->dst_reg].value_from_signed = false; } } else if (opcode > BPF_END) { @@ -2198,40 +2220,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum val is val, * otherwise we know the min val is val+1. */ false_reg->max_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val + 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum value is val - 1, * otherwise we know the mimimum value is val. */ false_reg->max_value = val - 1; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2239,6 +2284,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg @@ -2248,41 +2299,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* * If this is false, then the val is <= the register, if it is * true the register <= to the val. */ false_reg->min_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val - 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* If this is false then constant < register, if it is true then * the register < constant. */ false_reg->min_value = val + 1; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2290,6 +2364,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, -- cgit v1.1 From 2aeb1883547626d82c597cce2c99f0b9c62e2425 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 20 Jul 2017 16:14:55 +0200 Subject: perf/core: Fix locking for children siblings group read We're missing ctx lock when iterating children siblings within the perf_read path for group reading. Following race and crash can happen: User space doing read syscall on event group leader: T1: perf_read lock event->ctx->mutex perf_read_group lock leader->child_mutex __perf_read_group_add(child) list_for_each_entry(sub, &leader->sibling_list, group_entry) ----> sub might be invalid at this point, because it could get removed via perf_event_exit_task_context in T2 Child exiting and cleaning up its events: T2: perf_event_exit_task_context lock ctx->mutex list_for_each_entry_safe(child_event, next, &child_ctx->event_list,... perf_event_exit_event(child) lock ctx->lock perf_group_detach(child) unlock ctx->lock ----> child is removed from sibling_list without any sync with T1 path above ... free_event(child) Before the child is removed from the leader's child_list, (and thus is omitted from perf_read_group processing), we need to ensure that perf_read_group touches child's siblings under its ctx->lock. Peter further notes: | One additional note; this bug got exposed by commit: | | ba5213ae6b88 ("perf/core: Correct event creation with PERF_FORMAT_GROUP") | | which made it possible to actually trigger this code-path. Tested-by: Andi Kleen Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: ba5213ae6b88 ("perf/core: Correct event creation with PERF_FORMAT_GROUP") Link: http://lkml.kernel.org/r/20170720141455.2106-1-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c9cdbd3..c17c088 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4372,7 +4372,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { + struct perf_event_context *ctx = leader->ctx; struct perf_event *sub; + unsigned long flags; int n = 1; /* skip @nr */ int ret; @@ -4402,12 +4404,15 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + raw_spin_lock_irqsave(&ctx->lock, flags); + list_for_each_entry(sub, &leader->sibling_list, group_entry) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); } + raw_spin_unlock_irqrestore(&ctx->lock, flags); return 0; } -- cgit v1.1 From 3c74541777302eec43a0d1327c4d58b8659a776b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jul 2017 08:14:15 -0400 Subject: cgroup: fix error return value from cgroup_subtree_control() While refactoring, f7b2814bb9b6 ("cgroup: factor out cgroup_{apply|finalize}_control() from cgroup_subtree_control_write()") broke error return value from the function. The return value from the last operation is always overridden to zero. Fix it. Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7449759..df2e0f1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3001,11 +3001,11 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgrp->subtree_control &= ~disable; ret = cgroup_apply_control(cgrp); - cgroup_finalize_control(cgrp, ret); + if (ret) + goto out_unlock; kernfs_activate(cgrp->kn); - ret = 0; out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; -- cgit v1.1 From 9305706c2e808ae59f1eb201867f82f1ddf6d7a6 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 21 Jul 2017 14:37:34 +0100 Subject: bpf/verifier: fix min/max handling in BPF_SUB We have to subtract the src max from the dst min, and vice-versa, since (e.g.) the smallest result comes from the largest subtrahend. Fixes: 484611357c19 ("bpf: allow access into map value arrays") Signed-off-by: Edward Cree Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index af9e84a..664d939 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1865,10 +1865,12 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, * do our normal operations to the register, we need to set the values * to the min/max since they are undefined. */ - if (min_val == BPF_REGISTER_MIN_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (max_val == BPF_REGISTER_MAX_RANGE) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (opcode != BPF_SUB) { + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + } switch (opcode) { case BPF_ADD: @@ -1879,10 +1881,17 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg->min_align = min(src_align, dst_align); break; case BPF_SUB: + /* If one of our values was at the end of our ranges, then the + * _opposite_ value in the dst_reg goes to the end of our range. + */ + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value -= min_val; + dst_reg->min_value -= max_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value -= max_val; + dst_reg->max_value -= min_val; dst_reg->min_align = min(src_align, dst_align); break; case BPF_MUL: -- cgit v1.1 From bf50f0e8a03005d19de66d01261d855cdeedf572 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 24 Jul 2017 13:56:28 -0600 Subject: sched/core: Fix some documentation build warnings The kerneldoc comments for try_to_wake_up_local() were out of date, leading to these documentation build warnings: ./kernel/sched/core.c:2080: warning: No description found for parameter 'rf' ./kernel/sched/core.c:2080: warning: Excess function parameter 'cookie' description in 'try_to_wake_up_local' Update the comment to reflect current reality and give us some peace and quiet. Signed-off-by: Jonathan Corbet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20170724135628.695cecfc@lwn.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 17c667b..0869b20 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2069,7 +2069,7 @@ out: /** * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened - * @cookie: context's cookie for pinning + * @rf: request-queue flags for pinning * * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not -- cgit v1.1 From 0a94efb5acbb6980d7c9ab604372d93cd507e4d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jul 2017 08:36:15 -0400 Subject: workqueue: implicit ordered attribute should be overridable 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") automatically enabled ordered attribute for unbound workqueues w/ max_active == 1. Because ordered workqueues reject max_active and some attribute changes, this implicit ordered mode broke cases where the user creates an unbound workqueue w/ max_active == 1 and later explicitly changes the related attributes. This patch distinguishes explicit and implicit ordered setting and overrides from attribute changes if implict. Signed-off-by: Tejun Heo Fixes: 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") --- kernel/workqueue.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index abe4a49..7146ea7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3744,8 +3744,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; /* creating multiple pwqs breaks ordering guarantee */ - if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) - return -EINVAL; + if (!list_empty(&wq->pwqs)) { + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) + return -EINVAL; + + wq->flags &= ~__WQ_ORDERED; + } ctx = apply_wqattrs_prepare(wq, attrs); if (!ctx) @@ -4129,13 +4133,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) struct pool_workqueue *pwq; /* disallow meddling with max_active for ordered workqueues */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); + wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; for_each_pwq(pwq, wq) @@ -5263,7 +5268,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) * attributes breaks ordering guarantee. Disallow exposing ordered * workqueues. */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return -EINVAL; wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); -- cgit v1.1 From 8397913303abc9333f376a518a8368fa22ca5e6e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 27 Jul 2017 12:21:11 +0200 Subject: genirq/cpuhotplug: Revert "Set force affinity flag on hotplug migration" That commit was part of the changes moving x86 to the generic CPU hotplug interrupt migration code. The force flag was required on x86 before the hierarchical irqdomain rework, but invoking set_affinity() with force=true stayed and had no side effects. At some point in the past, the force flag got repurposed to support the exynos timer interrupt affinity setting to a not yet online CPU, so the interrupt controller callback does not verify the supplied affinity mask against cpu_online_mask. Setting the flag in the CPU hotplug code causes the cpu online masking to be blocked on these irq controllers and results in potentially affining an interrupt to the CPU which is unplugged, i.e. instead of moving it away, it's just reassigned to it. As the force flags is not longer needed on x86, it's safe to revert that patch so the ARM irqchips which use the force flag work again. Add comments to that effect, so this won't happen again. Note: The online mask handling should be done in the generic code and the force flag and the masking in the irq chips removed all together, but that's not a change possible for 4.13. Fixes: 77f85e66aa8b ("genirq/cpuhotplug: Set force affinity flag on hotplug migration") Reported-by: Will Deacon Signed-off-by: Thomas Gleixner Acked-by: Will Deacon Cc: Marc Zyngier Cc: Russell King Cc: LAK Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1707271217590.3109@nanos Signed-off-by: Thomas Gleixner --- kernel/irq/cpuhotplug.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index aee8f7e..638eb9c 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -95,8 +95,13 @@ static bool migrate_one_irq(struct irq_desc *desc) affinity = cpu_online_mask; brokeaff = true; } - - err = irq_do_set_affinity(d, affinity, true); + /* + * Do not set the force argument of irq_do_set_affinity() as this + * disables the masking of offline CPUs from the supplied affinity + * mask and therefore might keep/reassign the irq to the outgoing + * CPU. + */ + err = irq_do_set_affinity(d, affinity, false); if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); -- cgit v1.1 From 1ad0f0a7aa1bf3bd42dcd108a96713d255eacd9f Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Thu, 27 Jul 2017 16:27:14 -0500 Subject: workqueue: Work around edge cases for calc of pool's cpumask There is an underlying assumption/trade-off in many layers of the Linux system that CPU <-> node mapping is static. This is despite the presence of features like NUMA and 'hotplug' that support the dynamic addition/ removal of fundamental system resources like CPUs and memory. PowerPC systems, however, do provide extensive features for the dynamic change of resources available to a system. Currently, there is little or no synchronization protection around the updating of the CPU <-> node mapping, and the export/update of this information for other layers / modules. In systems which can change this mapping during 'hotplug', like PowerPC, the information is changing underneath all layers that might reference it. This patch attempts to ensure that a valid, usable cpumask attribute is used by the workqueue infrastructure when setting up new resource pools. It prevents a crash that has been observed when an 'empty' cpumask is passed along to the worker/task scheduling code. It is intended as a temporary workaround until a more fundamental review and correction of the issue can be done. [With additions to the patch provided by Tejun Hao ] Signed-off-by: Michael Bringmann Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7146ea7..ca937b0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3577,6 +3577,13 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, /* yeap, return possible CPUs in @node that @attrs wants */ cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); + + if (cpumask_empty(cpumask)) { + pr_warn_once("WARNING: workqueue cpumask: online intersect > " + "possible intersect\n"); + return false; + } + return !cpumask_equal(cpumask, attrs->cpumask); use_dfl: -- cgit v1.1 From 89b096898a8450b0a5b97d521e000ae9f94f81f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 27 Jul 2017 21:02:46 +0200 Subject: bpf: don't indicate success when copy_from_user fails err in bpf_prog_get_info_by_fd() still holds 0 at that time from prior check_uarg_tail_zero() check. Explicitly return -EFAULT instead, so user space can be notified of buggy behavior. Fixes: 1e2709769086 ("bpf: Add BPF_OBJ_GET_INFO_BY_FD") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 045646d..84bb399 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1289,7 +1289,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info_len = min_t(u32, sizeof(info), info_len); if (copy_from_user(&info, uinfo, info_len)) - return err; + return -EFAULT; info.type = prog->type; info.id = prog->aux->id; -- cgit v1.1 From 9975a54b3c9ecf029cbf5dd7a8c9701b1d74029e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 28 Jul 2017 17:05:25 +0200 Subject: bpf: fix bpf_prog_get_info_by_fd to dump correct xlated_prog_len bpf_prog_size(prog->len) is not the correct length we want to dump back to user space. The code in bpf_prog_get_info_by_fd() uses this to copy prog->insnsi to user space, but bpf_prog_size(prog->len) also includes the size of struct bpf_prog itself plus program instructions and is usually used either in context of accounting or for bpf_prog_alloc() et al, thus we copy out of bounds in bpf_prog_get_info_by_fd() potentially. Use the correct bpf_prog_insn_size() instead. Fixes: 1e2709769086 ("bpf: Add BPF_OBJ_GET_INFO_BY_FD") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 84bb399..6c772ad 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1312,7 +1312,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } ulen = info.xlated_prog_len; - info.xlated_prog_len = bpf_prog_size(prog->len); + info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); -- cgit v1.1 From 34f41c0316ed52b0b44542491d89278efdaa70e4 Mon Sep 17 00:00:00 2001 From: Matija Glavinic Pecotic Date: Tue, 1 Aug 2017 09:11:52 +0200 Subject: timers: Fix overflow in get_next_timer_interrupt For e.g. HZ=100, timer being 430 jiffies in the future, and 32 bit unsigned int, there is an overflow on unsigned int right-hand side of the expression which results with wrong values being returned. Type cast the multiplier to 64bit to avoid that issue. Fixes: 46c8f0b077a8 ("timers: Fix get_next_timer_interrupt() computation") Signed-off-by: Matija Glavinic Pecotic Signed-off-by: Thomas Gleixner Reviewed-by: Alexander Sverdlin Cc: khilman@baylibre.com Cc: akpm@linux-foundation.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/a7900f04-2a21-c9fd-67be-ab334d459ee5@nokia.com --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 71ce3f4..8f5d1bf 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1495,7 +1495,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) base->is_idle = false; } else { if (!is_max_delta) - expires = basem + (nextevt - basej) * TICK_NSEC; + expires = basem + (u64)(nextevt - basej) * TICK_NSEC; /* * If we expect to sleep more than a tick, mark the base idle: */ -- cgit v1.1 From 4bb0f0e73c8c30917d169c4a0f1ac083690c545b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 1 Aug 2017 12:01:52 -0400 Subject: tracing: Call clear_boot_tracer() at lateinit_sync The clear_boot_tracer function is used to reset the default_bootup_tracer string to prevent it from being accessed after boot, as it originally points to init data. But since clear_boot_tracer() is called via the init_lateinit() call, it races with the initcall for registering the hwlat tracer. If someone adds "ftrace=hwlat" to the kernel command line, depending on how the linker sets up the text, the saved command line may be cleared, and the hwlat tracer never is initialized. Simply have the clear_boot_tracer() be called by initcall_lateinit_sync() as that's for tasks to be called after lateinit. Link: https://bugzilla.kernel.org/show_bug.cgi?id=196551 Cc: stable@vger.kernel.org Fixes: e7c15cd8a ("tracing: Added hardware latency tracer") Reported-by: Zamir SUN Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42b9355..784fb43 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8407,4 +8407,4 @@ __init static int clear_boot_tracer(void) } fs_initcall(tracer_init_tracefs); -late_initcall(clear_boot_tracer); +late_initcall_sync(clear_boot_tracer); -- cgit v1.1 From 147d88e0b5eb90191bc5c12ca0a3c410b75a13d2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 1 Aug 2017 14:02:01 +0300 Subject: tracing: Missing error code in tracer_alloc_buffers() If ring_buffer_alloc() or one of the next couple function calls fail then we should return -ENOMEM but the current code returns success. Link: http://lkml.kernel.org/r/20170801110201.ajdkct7vwzixahvx@mwanda Cc: Sebastian Andrzej Siewior Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: b32614c03413 ('tracing/rb: Convert to hotplug state machine') Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 784fb43..d815fc3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8293,6 +8293,7 @@ __init static int tracer_alloc_buffers(void) if (ret < 0) goto out_free_cpumask; /* Used for event triggers */ + ret = -ENOMEM; temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) goto out_rm_hp_state; -- cgit v1.1 From a7e52ad7ed82e21273eccff93d1477a7b313aabb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 2 Aug 2017 14:20:54 -0400 Subject: ring-buffer: Have ring_buffer_alloc_read_page() return error on offline CPU Chunyu Hu reported: "per_cpu trace directories and files are created for all possible cpus, but only the cpus which have ever been on-lined have their own per cpu ring buffer (allocated by cpuhp threads). While trace_buffers_open, the open handler for trace file 'trace_pipe_raw' is always trying to access field of ring_buffer_per_cpu, and would panic with the NULL pointer. Align the behavior of trace_pipe_raw with trace_pipe, that returns -NODEV when openning it if that cpu does not have trace ring buffer. Reproduce: cat /sys/kernel/debug/tracing/per_cpu/cpu31/trace_pipe_raw (cpu31 is never on-lined, this is a 16 cores x86_64 box) Tested with: 1) boot with maxcpus=14, read trace_pipe_raw of cpu15. Got -NODEV. 2) oneline cpu15, read trace_pipe_raw of cpu15. Get the raw trace data. Call trace: [ 5760.950995] RIP: 0010:ring_buffer_alloc_read_page+0x32/0xe0 [ 5760.961678] tracing_buffers_read+0x1f6/0x230 [ 5760.962695] __vfs_read+0x37/0x160 [ 5760.963498] ? __vfs_read+0x5/0x160 [ 5760.964339] ? security_file_permission+0x9d/0xc0 [ 5760.965451] ? __vfs_read+0x5/0x160 [ 5760.966280] vfs_read+0x8c/0x130 [ 5760.967070] SyS_read+0x55/0xc0 [ 5760.967779] do_syscall_64+0x67/0x150 [ 5760.968687] entry_SYSCALL64_slow_path+0x25/0x25" This was introduced by the addition of the feature to reuse reader pages instead of re-allocating them. The problem is that the allocation of a reader page (which is per cpu) does not check if the cpu is online and set up for the ring buffer. Link: http://lkml.kernel.org/r/1500880866-1177-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 73a757e63114 ("ring-buffer: Return reader page back into existing ring buffer") Reported-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 14 +++++++++----- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 16 +++++++++++----- 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 529cc50..81279c6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * the page that was allocated, with the read page of the buffer. * * Returns: - * The page allocated, or NULL on error. + * The page allocated, or ERR_PTR */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = NULL; unsigned long flags; struct page *page; + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return ERR_PTR(-ENODEV); + + cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); @@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) - return NULL; + return ERR_PTR(-ENOMEM); bpage = page_address(page); @@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * * for example: * rpage = ring_buffer_alloc_read_page(buffer, cpu); - * if (!rpage) - * return error; + * if (IS_ERR(rpage)) + * return PTR_ERR(rpage); * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); * if (ret >= 0) * process_page(rpage, ret); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 9fbcaf5..68ee79a 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -113,7 +113,7 @@ static enum event_status read_page(int cpu) int i; bpage = ring_buffer_alloc_read_page(buffer, cpu); - if (!bpage) + if (IS_ERR(bpage)) return EVENT_DROPPED; ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d815fc3..44004d8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6598,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; - ssize_t ret; + ssize_t ret = 0; ssize_t size; if (!count) @@ -6612,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); - info->spare_cpu = iter->cpu_file; + if (IS_ERR(info->spare)) { + ret = PTR_ERR(info->spare); + info->spare = NULL; + } else { + info->spare_cpu = iter->cpu_file; + } } if (!info->spare) - return -ENOMEM; + return ret; /* Do we have previous read data to read? */ if (info->read < PAGE_SIZE) @@ -6790,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ref->ref = 1; ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); - if (!ref->page) { - ret = -ENOMEM; + if (IS_ERR(ref->page)) { + ret = PTR_ERR(ref->page); + ref->page = NULL; kfree(ref); break; } -- cgit v1.1 From 27e37d84e55f47bf28f7072fadf8fa2cbc3d9375 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 2 Aug 2017 13:31:50 -0700 Subject: pid: kill pidhash_size in pidhash_init() After commit 3d375d78593c ("mm: update callers to use HASH_ZERO flag"), drop unused pidhash_size in pidhash_init(). Link: http://lkml.kernel.org/r/1500389267-49222-1-git-send-email-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 731c4e5..c69c30d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -575,13 +575,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - unsigned int pidhash_size; - pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL | HASH_ZERO, &pidhash_shift, NULL, 0, 4096); - pidhash_size = 1U << pidhash_shift; } void __init pidmap_init(void) -- cgit v1.1 From 89affbf5d9ebb15c6460596822e8857ea2f9e735 Mon Sep 17 00:00:00 2001 From: Dima Zavin Date: Wed, 2 Aug 2017 13:32:18 -0700 Subject: cpuset: fix a deadlock due to incomplete patching of cpusets_enabled() In codepaths that use the begin/retry interface for reading mems_allowed_seq with irqs disabled, there exists a race condition that stalls the patch process after only modifying a subset of the static_branch call sites. This problem manifested itself as a deadlock in the slub allocator, inside get_any_partial. The loop reads mems_allowed_seq value (via read_mems_allowed_begin), performs the defrag operation, and then verifies the consistency of mem_allowed via the read_mems_allowed_retry and the cookie returned by xxx_begin. The issue here is that both begin and retry first check if cpusets are enabled via cpusets_enabled() static branch. This branch can be rewritted dynamically (via cpuset_inc) if a new cpuset is created. The x86 jump label code fully synchronizes across all CPUs for every entry it rewrites. If it rewrites only one of the callsites (specifically the one in read_mems_allowed_retry) and then waits for the smp_call_function(do_sync_core) to complete while a CPU is inside the begin/retry section with IRQs off and the mems_allowed value is changed, we can hang. This is because begin() will always return 0 (since it wasn't patched yet) while retry() will test the 0 against the actual value of the seq counter. The fix is to use two different static keys: one for begin (pre_enable_key) and one for retry (enable_key). In cpuset_inc(), we first bump the pre_enable key to ensure that cpuset_mems_allowed_begin() always return a valid seqcount if are enabling cpusets. Similarly, when disabling cpusets via cpuset_dec(), we first ensure that callers of cpuset_mems_allowed_retry() will start ignoring the seqcount value before we let cpuset_mems_allowed_begin() return 0. The relevant stack traces of the two stuck threads: CPU: 1 PID: 1415 Comm: mkdir Tainted: G L 4.9.36-00104-g540c51286237 #4 Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017 task: ffff8817f9c28000 task.stack: ffffc9000ffa4000 RIP: smp_call_function_many+0x1f9/0x260 Call Trace: smp_call_function+0x3b/0x70 on_each_cpu+0x2f/0x90 text_poke_bp+0x87/0xd0 arch_jump_label_transform+0x93/0x100 __jump_label_update+0x77/0x90 jump_label_update+0xaa/0xc0 static_key_slow_inc+0x9e/0xb0 cpuset_css_online+0x70/0x2e0 online_css+0x2c/0xa0 cgroup_apply_control_enable+0x27f/0x3d0 cgroup_mkdir+0x2b7/0x420 kernfs_iop_mkdir+0x5a/0x80 vfs_mkdir+0xf6/0x1a0 SyS_mkdir+0xb7/0xe0 entry_SYSCALL_64_fastpath+0x18/0xad ... CPU: 2 PID: 1 Comm: init Tainted: G L 4.9.36-00104-g540c51286237 #4 Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017 task: ffff8818087c0000 task.stack: ffffc90000030000 RIP: int3+0x39/0x70 Call Trace: <#DB> ? ___slab_alloc+0x28b/0x5a0 ? copy_process.part.40+0xf7/0x1de0 __slab_alloc.isra.80+0x54/0x90 copy_process.part.40+0xf7/0x1de0 copy_process.part.40+0xf7/0x1de0 kmem_cache_alloc_node+0x8a/0x280 copy_process.part.40+0xf7/0x1de0 _do_fork+0xe7/0x6c0 _raw_spin_unlock_irq+0x2d/0x60 trace_hardirqs_on_caller+0x136/0x1d0 entry_SYSCALL_64_fastpath+0x5/0xad do_syscall_64+0x27/0x350 SyS_clone+0x19/0x20 do_syscall_64+0x60/0x350 entry_SYSCALL64_slow_path+0x25/0x25 Link: http://lkml.kernel.org/r/20170731040113.14197-1-dmitriyz@waymo.com Fixes: 46e700abc44c ("mm, page_alloc: remove unnecessary taking of a seqlock when cpusets are disabled") Signed-off-by: Dima Zavin Reported-by: Cliff Spradlin Acked-by: Vlastimil Babka Cc: Peter Zijlstra Cc: Christopher Lameter Cc: Li Zefan Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cpuset.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ca8376e..8d51516 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -63,6 +63,7 @@ #include #include +DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ -- cgit v1.1 From fbb77611e95d3d5b2af86a59754a3130877cb667 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Sat, 5 Aug 2017 23:00:50 +0300 Subject: Fix compat_sys_sigpending breakage The latest change of compat_sys_sigpending in commit 8f13621abced ("sigpending(): move compat to native") has broken it in two ways. First, it tries to write 4 bytes more than userspace expects: sizeof(old_sigset_t) == sizeof(long) == 8 instead of sizeof(compat_old_sigset_t) == sizeof(u32) == 4. Second, on big endian architectures these bytes are being written in the wrong order. This bug was found by strace test suite. Reported-by: Anatoly Pugachev Inspired-by: Eugene Syromyatnikov Fixes: 8f13621abced ("sigpending(): move compat to native") Signed-off-by: Dmitry V. Levin Acked-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/signal.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index caed913..7e33f8c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3303,12 +3303,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { +#ifdef __BIG_ENDIAN sigset_t set; - int err = do_sigpending(&set, sizeof(old_sigset_t)); - if (err == 0) - if (copy_to_user(set32, &set, sizeof(old_sigset_t))) - err = -EFAULT; + int err = do_sigpending(&set, sizeof(set.sig[0])); + if (!err) + err = put_user(set.sig[0], set32); return err; +#else + return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32)); +#endif } #endif -- cgit v1.1 From 48fb6f4db940e92cfb16cd878cddd59ea6120d06 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 9 Aug 2017 08:27:11 +0100 Subject: futex: Remove unnecessary warning from get_futex_key Commit 65d8fc777f6d ("futex: Remove requirement for lock_page() in get_futex_key()") removed an unnecessary lock_page() with the side-effect that page->mapping needed to be treated very carefully. Two defensive warnings were added in case any assumption was missed and the first warning assumed a correct application would not alter a mapping backing a futex key. Since merging, it has not triggered for any unexpected case but Mark Rutland reported the following bug triggering due to the first warning. kernel BUG at kernel/futex.c:679! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 3695 Comm: syz-executor1 Not tainted 4.13.0-rc3-00020-g307fec773ba3 #3 Hardware name: linux,dummy-virt (DT) task: ffff80001e271780 task.stack: ffff000010908000 PC is at get_futex_key+0x6a4/0xcf0 kernel/futex.c:679 LR is at get_futex_key+0x6a4/0xcf0 kernel/futex.c:679 pc : [] lr : [] pstate: 80000145 The fact that it's a bug instead of a warning was due to an unrelated arm64 problem, but the warning itself triggered because the underlying mapping changed. This is an application issue but from a kernel perspective it's a recoverable situation and the warning is unnecessary so this patch removes the warning. The warning may potentially be triggered with the following test program from Mark although it may be necessary to adjust NR_FUTEX_THREADS to be a value smaller than the number of CPUs in the system. #include #include #include #include #include #include #include #include #define NR_FUTEX_THREADS 16 pthread_t threads[NR_FUTEX_THREADS]; void *mem; #define MEM_PROT (PROT_READ | PROT_WRITE) #define MEM_SIZE 65536 static int futex_wrapper(int *uaddr, int op, int val, const struct timespec *timeout, int *uaddr2, int val3) { syscall(SYS_futex, uaddr, op, val, timeout, uaddr2, val3); } void *poll_futex(void *unused) { for (;;) { futex_wrapper(mem, FUTEX_CMP_REQUEUE_PI, 1, NULL, mem + 4, 1); } } int main(int argc, char *argv[]) { int i; mem = mmap(NULL, MEM_SIZE, MEM_PROT, MAP_SHARED | MAP_ANONYMOUS, -1, 0); printf("Mapping @ %p\n", mem); printf("Creating futex threads...\n"); for (i = 0; i < NR_FUTEX_THREADS; i++) pthread_create(&threads[i], NULL, poll_futex, NULL); printf("Flipping mapping...\n"); for (;;) { mmap(mem, MEM_SIZE, MEM_PROT, MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, -1, 0); } return 0; } Reported-and-tested-by: Mark Rutland Signed-off-by: Mel Gorman Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org # 4.7+ Signed-off-by: Linus Torvalds --- kernel/futex.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 16dbe4c..f50b434 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -670,13 +670,14 @@ again: * this reference was taken by ihold under the page lock * pinning the inode in place so i_lock was unnecessary. The * only way for this check to fail is if the inode was - * truncated in parallel so warn for now if this happens. + * truncated in parallel which is almost certainly an + * application bug. In such a case, just retry. * * We are not calling into get_futex_key_refs() in file-backed * cases, therefore a successful atomic_inc return below will * guarantee that get_futex_key() will still imply smp_mb(); (B). */ - if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) { + if (!atomic_inc_not_zero(&inode->i_count)) { rcu_read_unlock(); put_page(page); -- cgit v1.1 From bfe334924ccd9f4a53f30240c03cf2f43f5b2df1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Aug 2017 19:39:30 +0200 Subject: perf/x86: Fix RDPMC vs. mm_struct tracking Vince reported the following rdpmc() testcase failure: > Failing test case: > > fd=perf_event_open(); > addr=mmap(fd); > exec() // without closing or unmapping the event > fd=perf_event_open(); > addr=mmap(fd); > rdpmc() // GPFs due to rdpmc being disabled The problem is of course that exec() plays tricks with what is current->mm, only destroying the old mappings after having installed the new mm. Fix this confusion by passing along vma->vm_mm instead of relying on current->mm. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 1e0fb9ec679c ("perf: Add pmu callbacks to track event mapping and unmapping") Link: http://lkml.kernel.org/r/20170802173930.cstykcqefmqt7jau@hirez.programming.kicks-ass.net [ Minor cleanups. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ff..a654b8a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5090,7 +5090,7 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -5113,7 +5113,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) unsigned long size = perf_data_size(rb); if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event); + event->pmu->event_unmapped(event, vma->vm_mm); /* * rb->aux_mmap_count will always drop before rb->mmap_count and @@ -5411,7 +5411,7 @@ aux_unlock: vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); return ret; } -- cgit v1.1 From 9b231d9f47c6114d317ce28cff92a74ad80547f5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Aug 2017 15:42:09 +0200 Subject: perf/core: Fix time on IOC_ENABLE Vince reported that when we do IOC_ENABLE/IOC_DISABLE while the task is SIGSTOP'ed state the timestamps go wobbly. It turns out we indeed fail to correctly account time while in 'OFF' state and doing IOC_ENABLE without getting scheduled in exposes the problem. Further thinking about this problem, it occurred to me that we can suffer a similar fate when we migrate an uncore event between CPUs. The perf_event_install() on the 'new' CPU will do add_event_to_ctx() which will reset all the time stamp, resulting in a subsequent update_event_times() to overwrite the total_time_* fields with smaller values. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a654b8a..ee20d4c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2217,6 +2217,33 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } +/* + * Complement to update_event_times(). This computes the tstamp_* values to + * continue 'enabled' state from @now, and effectively discards the time + * between the prior tstamp_stopped and now (as we were in the OFF state, or + * just switched (context) time base). + * + * This further assumes '@event->state == INACTIVE' (we just came from OFF) and + * cannot have been scheduled in yet. And going into INACTIVE state means + * '@event->tstamp_stopped = @now'. + * + * Thus given the rules of update_event_times(): + * + * total_time_enabled = tstamp_stopped - tstamp_enabled + * total_time_running = tstamp_stopped - tstamp_running + * + * We can insert 'tstamp_stopped == now' and reverse them to compute new + * tstamp_* values. + */ +static void __perf_event_enable_time(struct perf_event *event, u64 now) +{ + WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE); + + event->tstamp_stopped = now; + event->tstamp_enabled = now - event->total_time_enabled; + event->tstamp_running = now - event->total_time_running; +} + static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { @@ -2224,9 +2251,12 @@ static void add_event_to_ctx(struct perf_event *event, list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = tstamp; - event->tstamp_running = tstamp; - event->tstamp_stopped = tstamp; + /* + * We can be called with event->state == STATE_OFF when we create with + * .disabled = 1. In that case the IOC_ENABLE will call this function. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) + __perf_event_enable_time(event, tstamp); } static void ctx_sched_out(struct perf_event_context *ctx, @@ -2471,10 +2501,11 @@ static void __perf_event_mark_enabled(struct perf_event *event) u64 tstamp = perf_event_time(event); event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = tstamp - event->total_time_enabled; + __perf_event_enable_time(event, tstamp); list_for_each_entry(sub, &event->sibling_list, group_entry) { + /* XXX should not be > INACTIVE if event isn't */ if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = tstamp - sub->total_time_enabled; + __perf_event_enable_time(sub, tstamp); } } -- cgit v1.1 From d507e2ebd2c7be9138e5cf5c0cb1931c90c42ab1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 10 Aug 2017 15:23:31 -0700 Subject: mm: fix global NR_SLAB_.*CLAIMABLE counter reads As Tetsuo points out: "Commit 385386cff4c6 ("mm: vmstat: move slab statistics from zone to node counters") broke "Slab:" field of /proc/meminfo . It shows nearly 0kB" In addition to /proc/meminfo, this problem also affects the slab counters OOM/allocation failure info dumps, can cause early -ENOMEM from overcommit protection, and miscalculate image size requirements during suspend-to-disk. This is because the patch in question switched the slab counters from the zone level to the node level, but forgot to update the global accessor functions to read the aggregate node data instead of the aggregate zone data. Use global_node_page_state() to access the global slab counters. Fixes: 385386cff4c6 ("mm: vmstat: move slab statistics from zone to node counters") Link: http://lkml.kernel.org/r/20170801134256.5400-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Josef Bacik Cc: Vladimir Davydov Cc: Stefan Agner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 22231772..0972a8e 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1650,7 +1650,7 @@ static unsigned long minimum_image_size(unsigned long saveable) { unsigned long size; - size = global_page_state(NR_SLAB_RECLAIMABLE) + size = global_node_page_state(NR_SLAB_RECLAIMABLE) + global_node_page_state(NR_ACTIVE_ANON) + global_node_page_state(NR_INACTIVE_ANON) + global_node_page_state(NR_ACTIVE_FILE) -- cgit v1.1 From 16af97dc5a8975371a83d9e30a64038b48f40a2d Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 10 Aug 2017 15:23:56 -0700 Subject: mm: migrate: prevent racy access to tlb_flush_pending Patch series "fixes of TLB batching races", v6. It turns out that Linux TLB batching mechanism suffers from various races. Races that are caused due to batching during reclamation were recently handled by Mel and this patch-set deals with others. The more fundamental issue is that concurrent updates of the page-tables allow for TLB flushes to be batched on one core, while another core changes the page-tables. This other core may assume a PTE change does not require a flush based on the updated PTE value, while it is unaware that TLB flushes are still pending. This behavior affects KSM (which may result in memory corruption) and MADV_FREE and MADV_DONTNEED (which may result in incorrect behavior). A proof-of-concept can easily produce the wrong behavior of MADV_DONTNEED. Memory corruption in KSM is harder to produce in practice, but was observed by hacking the kernel and adding a delay before flushing and replacing the KSM page. Finally, there is also one memory barrier missing, which may affect architectures with weak memory model. This patch (of 7): Setting and clearing mm->tlb_flush_pending can be performed by multiple threads, since mmap_sem may only be acquired for read in task_numa_work(). If this happens, tlb_flush_pending might be cleared while one of the threads still changes PTEs and batches TLB flushes. This can lead to the same race between migration and change_protection_range() that led to the introduction of tlb_flush_pending. The result of this race was data corruption, which means that this patch also addresses a theoretically possible data corruption. An actual data corruption was not observed, yet the race was was confirmed by adding assertion to check tlb_flush_pending is not set by two threads, adding artificial latency in change_protection_range() and using sysctl to reduce kernel.numa_balancing_scan_delay_ms. Link: http://lkml.kernel.org/r/20170802000818.4760-2-namit@vmware.com Fixes: 20841405940e ("mm: fix TLB flush race between migration, and change_protection_range") Signed-off-by: Nadav Amit Acked-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Minchan Kim Cc: Andy Lutomirski Cc: Hugh Dickins Cc: "David S. Miller" Cc: Andrea Arcangeli Cc: Heiko Carstens Cc: Ingo Molnar Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Russell King Cc: Sergey Senozhatsky Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 17921b0..e075b77 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -807,7 +807,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_aio(mm); mm_init_owner(mm, p); mmu_notifier_mm_init(mm); - clear_tlb_flush_pending(mm); + init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif -- cgit v1.1 From d76036ab47eafa6ce52b69482e91ca3ba337d6d6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 15 Aug 2017 13:00:36 +0200 Subject: audit: Fix use after free in audit_remove_watch_rule() audit_remove_watch_rule() drops watch's reference to parent but then continues to work with it. That is not safe as parent can get freed once we drop our reference. The following is a trivial reproducer: mount -o loop image /mnt touch /mnt/file auditctl -w /mnt/file -p wax umount /mnt auditctl -D Grab our own reference in audit_remove_watch_rule() earlier to make sure mark does not get freed under us. CC: stable@vger.kernel.org Reported-by: Tony Jones Signed-off-by: Jan Kara Tested-by: Tony Jones Signed-off-by: Paul Moore --- kernel/audit_watch.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e0656bd..1c7ded4 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule) list_del(&krule->rlist); if (list_empty(&watch->rules)) { + /* + * audit_remove_watch() drops our reference to 'parent' which + * can get freed. Grab our own reference to be safe. + */ + audit_get_parent(parent); audit_remove_watch(watch); - - if (list_empty(&parent->watches)) { - audit_get_parent(parent); + if (list_empty(&parent->watches)) fsnotify_destroy_mark(&parent->mark, audit_watch_group); - audit_put_parent(parent); - } + audit_put_parent(parent); } } -- cgit v1.1 From b5fed474b98332559f2590c6bc90388a0899e793 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 15 Aug 2017 13:00:37 +0200 Subject: audit: Receive unmount event Although audit_watch_handle_event() can handle FS_UNMOUNT event, it is not part of AUDIT_FS_WATCH mask and thus such event never gets to audit_watch_handle_event(). Thus fsnotify marks are deleted by fsnotify subsystem on unmount without audit being notified about that which leads to a strange state of existing audit rules with dead fsnotify marks. Add FS_UNMOUNT to the mask of events to be received so that audit can clean up its state accordingly. Signed-off-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_watch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 1c7ded4..d1b5857 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -66,7 +66,7 @@ static struct fsnotify_group *audit_watch_group; /* fsnotify events we care about. */ #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ - FS_MOVE_SELF | FS_EVENT_ON_CHILD) + FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT) static void audit_free_parent(struct audit_parent *parent) { -- cgit v1.1 From 88a5c690b66110ad255380d8f629c629cf6ca559 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Aug 2017 01:45:33 +0200 Subject: bpf: fix bpf_trace_printk on 32 bit archs James reported that on MIPS32 bpf_trace_printk() is currently broken while MIPS64 works fine: bpf_trace_printk() uses conditional operators to attempt to pass different types to __trace_printk() depending on the format operators. This doesn't work as intended on 32-bit architectures where u32 and long are passed differently to u64, since the result of C conditional operators follows the "usual arithmetic conversions" rules, such that the values passed to __trace_printk() will always be u64 [causing issues later in the va_list handling for vscnprintf()]. For example the samples/bpf/tracex5 test printed lines like below on MIPS32, where the fd and buf have come from the u64 fd argument, and the size from the buf argument: [...] 1180.941542: 0x00000001: write(fd=1, buf= (null), size=6258688) Instead of this: [...] 1625.616026: 0x00000001: write(fd=1, buf=009e4000, size=512) One way to get it working is to expand various combinations of argument types into 8 different combinations for 32 bit and 64 bit kernels. Fix tested by James on MIPS32 and MIPS64 as well that it resolves the issue. Fixes: 9c959c863f82 ("tracing: Allow BPF programs to call bpf_trace_printk()") Reported-by: James Hogan Tested-by: James Hogan Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3738519..dc498b6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -204,10 +204,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, fmt_cnt++; } - return __trace_printk(1/* fake ip will not be printed */, fmt, - mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, - mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, - mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3); +/* Horrid workaround for getting va_list handling working with different + * argument type combinations generically for 32 and 64 bit archs. + */ +#define __BPF_TP_EMIT() __BPF_ARG3_TP() +#define __BPF_TP(...) \ + __trace_printk(1 /* Fake ip will not be printed. */, \ + fmt, ##__VA_ARGS__) + +#define __BPF_ARG1_TP(...) \ + ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_TP(arg1, ##__VA_ARGS__) \ + : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_TP((long)arg1, ##__VA_ARGS__) \ + : __BPF_TP((u32)arg1, ##__VA_ARGS__))) + +#define __BPF_ARG2_TP(...) \ + ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \ + : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \ + : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__))) + +#define __BPF_ARG3_TP(...) \ + ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \ + : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \ + : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__))) + + return __BPF_TP_EMIT(); } static const struct bpf_func_proto bpf_trace_printk_proto = { -- cgit v1.1 From e8f241893dfbbebe2813c01eac54f263e6a5e59c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Aug 2017 10:53:45 +0100 Subject: genirq: Restore trigger settings in irq_modify_status() irq_modify_status starts by clearing the trigger settings from irq_data before applying the new settings, but doesn't restore them, leaving them to IRQ_TYPE_NONE. That's pretty confusing to the potential request_irq() that could follow. Instead, snapshot the settings before clearing them, and restore them if the irq_modify_status() invocation was not changing the trigger. Fixes: 1e2a7d78499e ("irqdomain: Don't set type when mapping an IRQ") Reported-and-tested-by: jeffy Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Jon Hunter Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170818095345.12378-1-marc.zyngier@arm.com --- kernel/irq/chip.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a3cc37c..3675c60 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1000,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - unsigned long flags; + unsigned long flags, trigger, tmp; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) @@ -1014,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); + trigger = irqd_get_trigger_type(&desc->irq_data); + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); if (irq_settings_has_no_balance_set(desc)) @@ -1025,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); - irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); + tmp = irq_settings_get_trigger_mask(desc); + if (tmp != IRQ_TYPE_NONE) + trigger = tmp; + + irqd_set(&desc->irq_data, trigger); irq_put_desc_unlock(desc, flags); } -- cgit v1.1 From 7edaeb6841dfb27e362288ab8466ebdc4972e867 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Aug 2017 09:50:13 +0200 Subject: kernel/watchdog: Prevent false positives with turbo modes The hardlockup detector on x86 uses a performance counter based on unhalted CPU cycles and a periodic hrtimer. The hrtimer period is about 2/5 of the performance counter period, so the hrtimer should fire 2-3 times before the performance counter NMI fires. The NMI code checks whether the hrtimer fired since the last invocation. If not, it assumess a hard lockup. The calculation of those periods is based on the nominal CPU frequency. Turbo modes increase the CPU clock frequency and therefore shorten the period of the perf/NMI watchdog. With extreme Turbo-modes (3x nominal frequency) the perf/NMI period is shorter than the hrtimer period which leads to false positives. A simple fix would be to shorten the hrtimer period, but that comes with the side effect of more frequent hrtimer and softlockup thread wakeups, which is not desired. Implement a low pass filter, which checks the perf/NMI period against kernel time. If the perf/NMI fires before 4/5 of the watchdog period has elapsed then the event is ignored and postponed to the next perf/NMI. That solves the problem and avoids the overhead of shorter hrtimer periods and more frequent softlockup thread wakeups. Fixes: 58687acba592 ("lockup_detector: Combine nmi_watchdog and softlockup detector") Reported-and-tested-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: dzickus@redhat.com Cc: prarit@redhat.com Cc: ak@linux.intel.com Cc: babu.moger@oracle.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: acme@redhat.com Cc: stable@vger.kernel.org Cc: atomlin@redhat.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1708150931310.1886@nanos --- kernel/watchdog.c | 1 + kernel/watchdog_hld.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 06d3389..f5d5202 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -240,6 +240,7 @@ static void set_sample_period(void) * hardlockup detector generates a warning */ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); + watchdog_update_hrtimer_threshold(sample_period); } /* Commands for resetting the watchdog */ diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 295a0d8..3a09ea1 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void) } EXPORT_SYMBOL(arch_touch_nmi_watchdog); +#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP +static DEFINE_PER_CPU(ktime_t, last_timestamp); +static DEFINE_PER_CPU(unsigned int, nmi_rearmed); +static ktime_t watchdog_hrtimer_sample_threshold __read_mostly; + +void watchdog_update_hrtimer_threshold(u64 period) +{ + /* + * The hrtimer runs with a period of (watchdog_threshold * 2) / 5 + * + * So it runs effectively with 2.5 times the rate of the NMI + * watchdog. That means the hrtimer should fire 2-3 times before + * the NMI watchdog expires. The NMI watchdog on x86 is based on + * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles + * might run way faster than expected and the NMI fires in a + * smaller period than the one deduced from the nominal CPU + * frequency. Depending on the Turbo-Mode factor this might be fast + * enough to get the NMI period smaller than the hrtimer watchdog + * period and trigger false positives. + * + * The sample threshold is used to check in the NMI handler whether + * the minimum time between two NMI samples has elapsed. That + * prevents false positives. + * + * Set this to 4/5 of the actual watchdog threshold period so the + * hrtimer is guaranteed to fire at least once within the real + * watchdog threshold. + */ + watchdog_hrtimer_sample_threshold = period * 2; +} + +static bool watchdog_check_timestamp(void) +{ + ktime_t delta, now = ktime_get_mono_fast_ns(); + + delta = now - __this_cpu_read(last_timestamp); + if (delta < watchdog_hrtimer_sample_threshold) { + /* + * If ktime is jiffies based, a stalled timer would prevent + * jiffies from being incremented and the filter would look + * at a stale timestamp and never trigger. + */ + if (__this_cpu_inc_return(nmi_rearmed) < 10) + return false; + } + __this_cpu_write(nmi_rearmed, 0); + __this_cpu_write(last_timestamp, now); + return true; +} +#else +static inline bool watchdog_check_timestamp(void) +{ + return true; +} +#endif + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } + if (!watchdog_check_timestamp()) + return; + /* check for a hardlockup * This is done by making sure our timer interrupt * is incrementing. The timer interrupt should have -- cgit v1.1 From 2ba293c9e7db150943f06b12d3eb7213e7fae624 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 18 Aug 2017 15:15:58 -0700 Subject: kmod: fix wait on recursive loop Recursive loops with module loading were previously handled in kmod by restricting the number of modprobe calls to 50 and if that limit was breached request_module() would return an error and a user would see the following on their kernel dmesg: request_module: runaway loop modprobe binfmt-464c Starting init:/sbin/init exists but couldn't execute it (error -8) This issue could happen for instance when a 64-bit kernel boots a 32-bit userspace on some architectures and has no 32-bit binary format hanlders. This is visible, for instance, when a CONFIG_MODULES enabled 64-bit MIPS kernel boots a into o32 root filesystem and the binfmt handler for o32 binaries is not built-in. After commit 6d7964a722af ("kmod: throttle kmod thread limit") we now don't have any visible signs of an error and the kernel just waits for the loop to end somehow. Although this *particular* recursive loop could also be addressed by doing a sanity check on search_binary_handler() and disallowing a modular binfmt to be required for modprobe, a generic solution for any recursive kernel kmod issues is still needed. This should catch these loops. We can investigate each loop and address each one separately as they come in, this however puts a stop gap for them as before. Link: http://lkml.kernel.org/r/20170809234635.13443-3-mcgrof@kernel.org Fixes: 6d7964a722af ("kmod: throttle kmod thread limit") Signed-off-by: Luis R. Rodriguez Reported-by: Matt Redfearn Tested-by: Matt Redfearn Cc: "Eric W. Biederman" Cc: Colin Ian King Cc: Dan Carpenter Cc: Daniel Mentz Cc: David Binderman Cc: Dmitry Torokhov Cc: Ingo Molnar Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Michal Marek Cc: Miroslav Benes Cc: Peter Zijlstra (Intel) Cc: Petr Mladek Cc: Rusty Russell Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 6d016c5..2f37acd 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -71,6 +71,18 @@ static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); /* + * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads + * running at the same time without returning. When this happens we + * believe you've somehow ended up with a recursive module dependency + * creating a loop. + * + * We have no option but to fail. + * + * Userspace should proactively try to detect and prevent these. + */ +#define MAX_KMOD_ALL_BUSY_TIMEOUT 5 + +/* modprobe_path is set via /proc/sys. */ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; @@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...) pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", atomic_read(&kmod_concurrent_max), MAX_KMOD_CONCURRENT, module_name); - wait_event_interruptible(kmod_wq, - atomic_dec_if_positive(&kmod_concurrent_max) >= 0); + ret = wait_event_killable_timeout(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0, + MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); + if (!ret) { + pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", + module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); + return -ETIME; + } else if (ret == -ERESTARTSYS) { + pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name); + return ret; + } } trace_module_request(module_name, wait, _RET_IP_); -- cgit v1.1 From eb61b5911bdc923875cde99eb25203a0e2b06d43 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Fri, 18 Aug 2017 15:16:18 -0700 Subject: signal: don't remove SIGNAL_UNKILLABLE for traced tasks. When forcing a signal, SIGNAL_UNKILLABLE is removed to prevent recursive faults, but this is undesirable when tracing. For example, debugging an init process (whether global or namespace), hitting a breakpoint and SIGTRAP will force SIGTRAP and then remove SIGNAL_UNKILLABLE. Everything continues fine, but then once debugging has finished, the init process is left killable which is unlikely what the user expects, resulting in either an accidentally killed init or an init that stops reaping zombies. Link: http://lkml.kernel.org/r/20170815112806.10728-1-jamie.iles@oracle.com Signed-off-by: Jamie Iles Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7e33f8c..ed804a4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) recalc_sigpending_and_wake(t); } } - if (action->sa.sa_handler == SIG_DFL) + /* + * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect + * debugging to leave init killable. + */ + if (action->sa.sa_handler == SIG_DFL && !t->ptrace) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); -- cgit v1.1 From 8fbbe2d7cc478d1544f41f2271787c993c23a4f6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 19 Aug 2017 12:57:51 +0300 Subject: genirq/ipi: Fixup checks against nr_cpu_ids Valid CPU ids are [0, nr_cpu_ids-1] inclusive. Fixes: 3b8e29a82dd1 ("genirq: Implement ipi_send_mask/single()") Fixes: f9bce791ae2a ("genirq: Add a new function to get IPI reverse mapping") Signed-off-by: Alexey Dobriyan Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170819095751.GB27864@avx2 --- kernel/irq/ipi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 1a9abc1..259a22a 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) struct irq_data *data = irq_get_irq_data(irq); struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; - if (!data || !ipimask || cpu > nr_cpu_ids) + if (!data || !ipimask || cpu >= nr_cpu_ids) return INVALID_HWIRQ; if (!cpumask_test_cpu(cpu, ipimask)) @@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, if (!chip->ipi_send_single && !chip->ipi_send_mask) return -EINVAL; - if (cpu > nr_cpu_ids) + if (cpu >= nr_cpu_ids) return -EINVAL; if (dest) { -- cgit v1.1 From dd1c1f2f2028a7b851f701fc6a8ebe39dcb95e7c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2017 17:35:02 +0200 Subject: pids: make task_tgid_nr_ns() safe This was reported many times, and this was even mentioned in commit 52ee2dfdd4f5 ("pids: refactor vnr/nr_ns helpers to make them safe") but somehow nobody bothered to fix the obvious problem: task_tgid_nr_ns() is not safe because task->group_leader points to nowhere after the exiting task passes exit_notify(), rcu_read_lock() can not help. We really need to change __unhash_process() to nullify group_leader, parent, and real_parent, but this needs some cleanups. Until then we can turn task_tgid_nr_ns() into another user of __task_pid_nr_ns() and fix the problem. Reported-by: Troy Kensinger Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/pid.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index c69c30d..020dedb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) + if (type != PIDTYPE_PID) { + if (type == __PIDTYPE_TGID) + type = PIDTYPE_PID; task = task->group_leader; + } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); @@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, } EXPORT_SYMBOL(__task_pid_nr_ns); -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); -- cgit v1.1 From a8f0f9e49956a74718874b800251455680085600 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 17 Aug 2017 16:37:25 -0400 Subject: ftrace: Check for null ret_stack on profile function graph entry function There's a small race when function graph shutsdown and the calling of the registered function graph entry callback. The callback must not reference the task's ret_stack without first checking that it is not NULL. Note, when a ret_stack is allocated for a task, it stays allocated until the task exits. The problem here, is that function_graph is shutdown, and a new task was created, which doesn't have its ret_stack allocated. But since some of the functions are still being traced, the callbacks can still be called. The normal function_graph code handles this, but starting with commit 8861dd303c ("ftrace: Access ret_stack->subtime only in the function profiler") the profiler code references the ret_stack on function entry, but doesn't check if it is NULL first. Link: https://bugzilla.kernel.org/show_bug.cgi?id=196611 Cc: stable@vger.kernel.org Fixes: 8861dd303c ("ftrace: Access ret_stack->subtime only in the function profiler") Reported-by: lilydjwg@gmail.com Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02004ae9..96cea88 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -889,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) function_profile_call(trace->func, 0, NULL, NULL); + /* If function graph is shutting down, ret_stack can be NULL */ + if (!current->ret_stack) + return 0; + if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) current->ret_stack[index].subtime = 0; -- cgit v1.1 From 475bb3c69ab05df2a6ecef6acc2393703d134180 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Mon, 14 Aug 2017 18:18:17 +0800 Subject: tracing: Fix kmemleak in tracing_map_array_free() kmemleak reported the below leak when I was doing clear of the hist trigger. With this patch, the kmeamleak is gone. unreferenced object 0xffff94322b63d760 (size 32): comm "bash", pid 1522, jiffies 4403687962 (age 2442.311s) hex dump (first 32 bytes): 00 01 00 00 04 00 00 00 08 00 00 00 ff 00 00 00 ................ 10 00 00 00 00 00 00 00 80 a8 7a f2 31 94 ff ff ..........z.1... backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_trace+0xca/0x1d0 [] tracing_map_array_alloc+0x26/0x140 [] kretprobe_trampoline+0x0/0x50 [] create_hist_data+0x535/0x750 [] event_hist_trigger_func+0x1f7/0x420 [] event_trigger_write+0xfd/0x1a0 [] __vfs_write+0x37/0x170 [] vfs_write+0xb2/0x1b0 [] SyS_write+0x55/0xc0 [] do_syscall_64+0x67/0x150 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff unreferenced object 0xffff9431f27aa880 (size 128): comm "bash", pid 1522, jiffies 4403687962 (age 2442.311s) hex dump (first 32 bytes): 00 00 8c 2a 32 94 ff ff 00 f0 8b 2a 32 94 ff ff ...*2......*2... 00 e0 8b 2a 32 94 ff ff 00 d0 8b 2a 32 94 ff ff ...*2......*2... backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc+0xe8/0x220 [] tracing_map_array_alloc+0xb1/0x140 [] kretprobe_trampoline+0x0/0x50 [] create_hist_data+0x535/0x750 [] event_hist_trigger_func+0x1f7/0x420 [] event_trigger_write+0xfd/0x1a0 [] __vfs_write+0x37/0x170 [] vfs_write+0xb2/0x1b0 [] SyS_write+0x55/0xc0 [] do_syscall_64+0x67/0x150 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff Link: http://lkml.kernel.org/r/1502705898-27571-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 08d43a5fa063 ("tracing: Add lock-free tracing_map") Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 0a689bb..305039b 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a) if (!a) return; - if (!a->pages) { - kfree(a); - return; - } + if (!a->pages) + goto free; for (i = 0; i < a->n_pages; i++) { if (!a->pages[i]) break; free_page((unsigned long)a->pages[i]); } + + kfree(a->pages); + + free: + kfree(a); } struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, -- cgit v1.1 From 8b0db1a5bdfcee0dbfa89607672598ae203c9045 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 23 Aug 2017 12:46:27 -0400 Subject: tracing: Fix freeing of filter in create_filter() when set_str is false Performing the following task with kmemleak enabled: # cd /sys/kernel/tracing/events/irq/irq_handler_entry/ # echo 'enable_event:kmem:kmalloc:3 if irq >' > trigger # echo 'enable_event:kmem:kmalloc:3 if irq > 31' > trigger # echo scan > /sys/kernel/debug/kmemleak # cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800b9290308 (size 32): comm "bash", pid 1114, jiffies 4294848451 (age 141.139s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_trace+0x158/0x290 [] create_filter_start.constprop.28+0x99/0x940 [] create_filter+0xa9/0x160 [] create_event_filter+0xc/0x10 [] set_trigger_filter+0xe5/0x210 [] event_enable_trigger_func+0x324/0x490 [] event_trigger_write+0x1a2/0x260 [] __vfs_write+0xd7/0x380 [] vfs_write+0x101/0x260 [] SyS_write+0xab/0x130 [] entry_SYSCALL_64_fastpath+0x1f/0xbe [] 0xffffffffffffffff The function create_filter() is passed a 'filterp' pointer that gets allocated, and if "set_str" is true, it is up to the caller to free it, even on error. The problem is that the pointer is not freed by create_filter() when set_str is false. This is a bug, and it is not up to the caller to free the filter on error if it doesn't care about the string. Link: http://lkml.kernel.org/r/1502705898-27571-2-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 38b78eb85 ("tracing: Factorize filter creation") Reported-by: Chunyu Hu Tested-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 59a411f..181e139 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call, if (err && set_str) append_filter_err(ps, filter); } + if (err && !set_str) { + free_event_filter(filter); + filter = NULL; + } create_filter_finish(ps); *filterp = filter; -- cgit v1.1