From 692b48258dda7c302e777d7d5f4217244478f1f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Oct 2017 08:04:13 -0700 Subject: workqueue: replace pool->manager_arb mutex with a flag Josef reported a HARDIRQ-safe -> HARDIRQ-unsafe lock order detected by lockdep: [ 1270.472259] WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected [ 1270.472783] 4.14.0-rc1-xfstests-12888-g76833e8 #110 Not tainted [ 1270.473240] ----------------------------------------------------- [ 1270.473710] kworker/u5:2/5157 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: [ 1270.474239] (&(&lock->wait_lock)->rlock){+.+.}, at: [] __mutex_unlock_slowpath+0xa2/0x280 [ 1270.474994] [ 1270.474994] and this task is already holding: [ 1270.475440] (&pool->lock/1){-.-.}, at: [] worker_thread+0x366/0x3c0 [ 1270.476046] which would create a new lock dependency: [ 1270.476436] (&pool->lock/1){-.-.} -> (&(&lock->wait_lock)->rlock){+.+.} [ 1270.476949] [ 1270.476949] but this new dependency connects a HARDIRQ-irq-safe lock: [ 1270.477553] (&pool->lock/1){-.-.} ... [ 1270.488900] to a HARDIRQ-irq-unsafe lock: [ 1270.489327] (&(&lock->wait_lock)->rlock){+.+.} ... [ 1270.494735] Possible interrupt unsafe locking scenario: [ 1270.494735] [ 1270.495250] CPU0 CPU1 [ 1270.495600] ---- ---- [ 1270.495947] lock(&(&lock->wait_lock)->rlock); [ 1270.496295] local_irq_disable(); [ 1270.496753] lock(&pool->lock/1); [ 1270.497205] lock(&(&lock->wait_lock)->rlock); [ 1270.497744] [ 1270.497948] lock(&pool->lock/1); , which will cause a irq inversion deadlock if the above lock scenario happens. The root cause of this safe -> unsafe lock order is the mutex_unlock(pool->manager_arb) in manage_workers() with pool->lock held. Unlocking mutex while holding an irq spinlock was never safe and this problem has been around forever but it never got noticed because the only time the mutex is usually trylocked while holding irqlock making actual failures very unlikely and lockdep annotation missed the condition until the recent b9c16a0e1f73 ("locking/mutex: Fix lockdep_assert_held() fail"). Using mutex for pool->manager_arb has always been a bit of stretch. It primarily is an mechanism to arbitrate managership between workers which can easily be done with a pool flag. The only reason it became a mutex is that pool destruction path wants to exclude parallel managing operations. This patch replaces the mutex with a new pool flag POOL_MANAGER_ACTIVE and make the destruction path wait for the current manager on a wait queue. v2: Drop unnecessary flag clearing before pool destruction as suggested by Boqun. Signed-off-by: Tejun Heo Reported-by: Josef Bacik Reviewed-by: Lai Jiangshan Cc: Peter Zijlstra Cc: Boqun Feng Cc: stable@vger.kernel.org --- kernel/workqueue.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf..a2dccfe 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,6 +68,7 @@ enum { * attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. */ + POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ /* worker flags */ @@ -165,7 +166,6 @@ struct worker_pool { /* L: hash of busy workers */ /* see manage_workers() for details on the two manager mutexes */ - struct mutex manager_arb; /* manager arbitration */ struct worker *manager; /* L: purely informational */ struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ @@ -299,6 +299,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ +static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ @@ -801,7 +802,7 @@ static bool need_to_create_worker(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = mutex_is_locked(&pool->manager_arb); + bool managing = pool->flags & POOL_MANAGER_ACTIVE; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -1980,24 +1981,17 @@ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; - /* - * Anyone who successfully grabs manager_arb wins the arbitration - * and becomes the manager. mutex_trylock() on pool->manager_arb - * failure while holding pool->lock reliably indicates that someone - * else is managing the pool and the worker which failed trylock - * can proceed to executing work items. This means that anyone - * grabbing manager_arb is responsible for actually performing - * manager duties. If manager_arb is grabbed and released without - * actual management, the pool may stall indefinitely. - */ - if (!mutex_trylock(&pool->manager_arb)) + if (pool->flags & POOL_MANAGER_ACTIVE) return false; + + pool->flags |= POOL_MANAGER_ACTIVE; pool->manager = worker; maybe_create_worker(pool); pool->manager = NULL; - mutex_unlock(&pool->manager_arb); + pool->flags &= ~POOL_MANAGER_ACTIVE; + wake_up(&wq_manager_wait); return true; } @@ -3248,7 +3242,6 @@ static int init_worker_pool(struct worker_pool *pool) setup_timer(&pool->mayday_timer, pool_mayday_timeout, (unsigned long)pool); - mutex_init(&pool->manager_arb); mutex_init(&pool->attach_mutex); INIT_LIST_HEAD(&pool->workers); @@ -3318,13 +3311,15 @@ static void put_unbound_pool(struct worker_pool *pool) hash_del(&pool->hash_node); /* - * Become the manager and destroy all workers. Grabbing - * manager_arb prevents @pool's workers from blocking on - * attach_mutex. + * Become the manager and destroy all workers. This prevents + * @pool's workers from blocking on attach_mutex. We're the last + * manager and @pool gets freed with the flag set. */ - mutex_lock(&pool->manager_arb); - spin_lock_irq(&pool->lock); + wait_event_lock_irq(wq_manager_wait, + !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock); + pool->flags |= POOL_MANAGER_ACTIVE; + while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); @@ -3338,8 +3333,6 @@ static void put_unbound_pool(struct worker_pool *pool) if (pool->detach_completion) wait_for_completion(pool->detach_completion); - mutex_unlock(&pool->manager_arb); - /* shut down the timers */ del_timer_sync(&pool->idle_timer); del_timer_sync(&pool->mayday_timer); -- cgit v1.1 From 20608924cc2e6bdeaf6f58ccbe9ddfe12dbfa082 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 4 Oct 2017 14:26:26 +0200 Subject: genirq: generic chip: Add irq_gc_mask_disable_and_ack_set() The irq_gc_mask_disable_reg_and_ack() function name implies that it provides the combined functions of irq_gc_mask_disable_reg() and irq_gc_ack(). However, the implementation does not actually do that since it writes the mask instead of the disable register. It also does not maintain the mask cache which makes it inappropriate to use with other masking functions. In addition, commit 659fb32d1b67 ("genirq: replace irq_gc_ack() with {set,clr}_bit variants (fwd)") effectively renamed irq_gc_ack() to irq_gc_ack_set_bit() so this function probably should have also been renamed at that time. The generic chip code currently provides three functions for use with the irq_mask member of the irq_chip structure and two functions for use with the irq_ack member of the irq_chip structure. These functions could be combined into six functions for use with the irq_mask_ack member of the irq_chip structure. However, since only one of the combinations is currently used, only the function irq_gc_mask_disable_and_ack_set() is added by this commit. The '_reg' and '_bit' portions of the base function name were left out of the new combined function name in an attempt to keep the function name length manageable with the 80 character source code line length while still allowing the distinct aspects of each combination to be captured by the name. If other combinations are desired in the future please add them to the irq generic chip library at that time. Signed-off-by: Doug Berger Signed-off-by: Marc Zyngier --- kernel/irq/generic-chip.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 5270a54..ec5fe9a 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -151,6 +151,31 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) } /** + * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt + * @d: irq_data + * + * This generic implementation of the irq_mask_ack method is for chips + * with separate enable/disable registers instead of a single mask + * register and where a pending interrupt is acknowledged by setting a + * bit. + * + * Note: This is the only permutation currently used. Similar generic + * functions should be added here if other permutations are required. + */ +void irq_gc_mask_disable_and_ack_set(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; + + irq_gc_lock(gc); + irq_reg_writel(gc, mask, ct->regs.disable); + *ct->mask_cache &= ~mask; + irq_reg_writel(gc, mask, ct->regs.ack); + irq_gc_unlock(gc); +} + +/** * irq_gc_eoi - EOI interrupt * @d: irq_data */ -- cgit v1.1 From 0d08af35f16a0cc418ad2afde3bc5f70ace82705 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 4 Oct 2017 14:28:17 +0200 Subject: genirq: generic chip: remove irq_gc_mask_disable_reg_and_ack() Any usage of the irq_gc_mask_disable_reg_and_ack() function has been replaced with the desired functionality. The incorrect and ambiguously named function is removed here to prevent accidental misuse. Signed-off-by: Doug Berger Signed-off-by: Marc Zyngier --- kernel/irq/generic-chip.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index ec5fe9a..c26c5bb 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -135,22 +135,6 @@ void irq_gc_ack_clr_bit(struct irq_data *d) } /** - * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt - * @d: irq_data - */ -void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = d->mask; - - irq_gc_lock(gc); - irq_reg_writel(gc, mask, ct->regs.mask); - irq_reg_writel(gc, mask, ct->regs.ack); - irq_gc_unlock(gc); -} - -/** * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt * @d: irq_data * -- cgit v1.1 From 1f7c70d6b2bc5de301f30456621e1161fddf4242 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 21 Oct 2017 16:06:52 +0200 Subject: cpu/hotplug: Reset node state after operation The recent rework of the cpu hotplug internals changed the usage of the per cpu state->node field, but missed to clean it up after usage. So subsequent hotplug operations use the stale pointer from a previous operation and hand it into the callback functions. The callbacks then dereference a pointer which either belongs to a different facility or points to freed and potentially reused memory. In either case data corruption and crashes are the obvious consequence. Reset the node and the last pointers in the per cpu state to NULL after the operation which set them has completed. Fixes: 96abb968549c ("smp/hotplug: Allow external multi-instance rollback") Reported-by: Tvrtko Ursulin Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Boris Ostrovsky Cc: "Paul E. McKenney" Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710211606130.3213@nanos --- kernel/cpu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d851df2..04892a8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -632,6 +632,11 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, __cpuhp_kick_ap(st); } + /* + * Clean up the leftovers so the next hotplug operation wont use stale + * data. + */ + st->node = st->last = NULL; return ret; } -- cgit v1.1 From 8108a77515126f6db4374e8593956e20430307c0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 27 Oct 2017 09:45:34 -0700 Subject: bpf: bpf_compute_data uses incorrect cb structure SK_SKB program types use bpf_compute_data to store the end of the packet data. However, bpf_compute_data assumes the cb is stored in the qdisc layer format. But, for SK_SKB this is the wrong layer of the stack for this type. It happens to work (sort of!) because in most cases nothing happens to be overwritten today. This is very fragile and error prone. Fortunately, we have another hole in tcp_skb_cb we can use so lets put the data_end value there. Note, SK_SKB program types do not use data_meta, they are failed by sk_skb_is_valid_access(). Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 2b6eb35..6778fb7 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -93,6 +93,14 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +/* compute the linear packet data range [data, data_end) for skb when + * sk_skb type programs are in use. + */ +static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); +} + static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); @@ -108,7 +116,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) */ TCP_SKB_CB(skb)->bpf.map = NULL; skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_end_sk_skb(skb); preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); preempt_enable(); @@ -368,7 +376,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. */ skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_end_sk_skb(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); -- cgit v1.1 From bfa640757e9378c2f26867e723f1287e94f5a7ad Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 27 Oct 2017 09:45:53 -0700 Subject: bpf: rename sk_actions to align with bpf infrastructure Recent additions to support multiple programs in cgroups impose a strict requirement, "all yes is yes, any no is no". To enforce this the infrastructure requires the 'no' return code, SK_DROP in this case, to be 0. To apply these rules to SK_SKB program types the sk_actions return codes need to be adjusted. This fix adds SK_PASS and makes 'SK_DROP = 0'. Finally, remove SK_ABORTED to remove any chance that the API may allow aborted program flows to be passed up the stack. This would be incorrect behavior and allow programs to break existing policies. Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6778fb7..66f00a2 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -122,7 +122,8 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) preempt_enable(); skb->sk = NULL; - return rc; + return rc == SK_PASS ? + (TCP_SKB_CB(skb)->bpf.map ? SK_REDIRECT : SK_PASS) : SK_DROP; } static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) -- cgit v1.1