From 449e9cae58b06be1293858ec8e5d8cb728238baa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 1 Jun 2015 16:05:16 +0800 Subject: genirq: Move field 'node' from irq_data into irq_common_data NUMA node information is per-irq instead of per-irqchip, so move it into struct irq_common_data. Also use CONFIG_NUMA to guard irq_common_data.node. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Jason Cooper Cc: Kevin Cernekee Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/1433145945-789-8-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 +- kernel/irq/irqdesc.c | 4 +++- kernel/irq/irqdomain.c | 1 - 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index eee4b38..5ef0c2d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -194,7 +194,7 @@ static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) static inline int irq_desc_get_node(struct irq_desc *desc) { - return irq_data_get_node(&desc->irq_data); + return irq_common_data_get_node(&desc->irq_common_data); } #ifdef CONFIG_PM_SLEEP diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0a2a4b6..7f3e9fa 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -52,11 +52,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) static void desc_smp_init(struct irq_desc *desc, int node) { - desc->irq_data.node = node; cpumask_copy(desc->irq_data.affinity, irq_default_affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif +#ifdef CONFIG_NUMA + desc->irq_common_data.node = node; +#endif } #else diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 79baaf8..dc9d27c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -844,7 +844,6 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, child->parent_data = irq_data; irq_data->irq = child->irq; irq_data->common = child->common; - irq_data->node = child->node; irq_data->domain = domain; } -- cgit v1.1 From af7080e040d223b5e7d0a8de28f7cea24ef017c4 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 1 Jun 2015 16:05:21 +0800 Subject: genirq: Move field 'handler_data' from irq_data into irq_common_data Handler data (handler_data) is per-irq instead of per irqchip, so move it into struct irq_common_data. 
Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Jason Cooper Cc: Kevin Cernekee Cc: Arnd Bergmann Cc: Marc Zyngier Link: http://lkml.kernel.org/r/1433145945-789-13-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 4 ++-- kernel/irq/irqdesc.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6e40a95..a48e00e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -83,7 +83,7 @@ int irq_set_handler_data(unsigned int irq, void *data) if (!desc) return -EINVAL; - desc->irq_data.handler_data = data; + desc->irq_common_data.handler_data = data; irq_put_desc_unlock(desc, flags); return 0; } @@ -796,7 +796,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, return; __irq_do_set_handler(desc, handle, 1, NULL); - desc->irq_data.handler_data = data; + desc->irq_common_data.handler_data = data; irq_put_desc_busunlock(desc, flags); } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 7f3e9fa..594b3e3 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -72,11 +72,12 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, { int cpu; + desc->irq_common_data.handler_data = NULL; + desc->irq_data.common = &desc->irq_common_data; desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; - desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); -- cgit v1.1 From 9df872faa7e1619e9278bec00ceaed2236533530 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jun 2015 11:47:50 +0800 Subject: genirq: Move field 'affinity' from irq_data into irq_common_data Irq affinity mask is per-irq instead of per irqchip, so move it into struct irq_common_data. 
Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Jason Cooper Cc: Kevin Cernekee Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/1433303281-27688-1-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 9 +++++---- kernel/irq/manage.c | 12 ++++++------ kernel/irq/proc.c | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 594b3e3..bb48a5c 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -38,12 +38,13 @@ static void __init init_irq_default_affinity(void) #ifdef CONFIG_SMP static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { - if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) + if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, + gfp, node)) return -ENOMEM; #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { - free_cpumask_var(desc->irq_data.affinity); + free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif @@ -52,7 +53,7 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) static void desc_smp_init(struct irq_desc *desc, int node) { - cpumask_copy(desc->irq_data.affinity, irq_default_affinity); + cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif @@ -124,7 +125,7 @@ static void free_masks(struct irq_desc *desc) #ifdef CONFIG_GENERIC_PENDING_IRQ free_cpumask_var(desc->pending_mask); #endif - free_cpumask_var(desc->irq_data.affinity); + free_cpumask_var(desc->irq_common_data.affinity); } #else static inline void free_masks(struct irq_desc *desc) { } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ad1b064..f9a59f6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -192,7 +192,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: - cpumask_copy(data->affinity, mask); + cpumask_copy(desc->irq_common_data.affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); ret = 0; @@ -304,7 +304,7 @@ static void irq_affinity_notify(struct work_struct *work) if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else - cpumask_copy(cpumask, desc->irq_data.affinity); + cpumask_copy(cpumask, desc->irq_common_data.affinity); raw_spin_unlock_irqrestore(&desc->lock, flags); notify->notify(notify, cpumask); @@ -375,9 +375,9 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) * one of the targets is online. */ if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { - if (cpumask_intersects(desc->irq_data.affinity, + if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) - set = desc->irq_data.affinity; + set = desc->irq_common_data.affinity; else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } @@ -829,8 +829,8 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. 
*/ - if (desc->irq_data.affinity) - cpumask_copy(mask, desc->irq_data.affinity); + if (desc->irq_common_data.affinity) + cpumask_copy(mask, desc->irq_common_data.affinity); else valid = false; raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 0e97c14..e3a8c95 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -39,7 +39,7 @@ static struct proc_dir_entry *root_irq_dir; static int show_irq_affinity(int type, struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->irq_data.affinity; + const struct cpumask *mask = desc->irq_common_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (irqd_is_setaffinity_pending(&desc->irq_data)) -- cgit v1.1 From b237721c5d95082a803c0be686f56d2dd1de995b Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 1 Jun 2015 16:05:43 +0800 Subject: genirq: Move field 'msi_desc' from irq_data into irq_common_data MSI descriptors are per-irq instead of per irqchip, so move it into struct irq_common_data. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Jason Cooper Cc: Kevin Cernekee Cc: Arnd Bergmann Cc: Marc Zyngier Link: http://lkml.kernel.org/r/1433145945-789-35-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- kernel/irq/irqdesc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a48e00e..8c55d54 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -105,7 +105,7 @@ int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, if (!desc) return -EINVAL; - desc->irq_data.msi_desc = entry; + desc->irq_common_data.msi_desc = entry; if (entry && !irq_offset) entry->irq = irq_base; irq_put_desc_unlock(desc, flags); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index bb48a5c..5966694 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -74,12 +74,12 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, int cpu; desc->irq_common_data.handler_data = NULL; + desc->irq_common_data.msi_desc = NULL; desc->irq_data.common = &desc->irq_common_data; desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; - desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); desc->handle_irq = handle_bad_irq; -- cgit v1.1 From bd0b9ac405e1794d72533c3d487aa65b6b955a0c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Sep 2015 10:42:37 +0200 Subject: genirq: Remove irq argument from irq flow handlers Most interrupt flow handlers do not use the irq argument. Those few which use it can retrieve the irq number from the irq descriptor. Remove the argument. Search and replace was done with coccinelle and some extra helper scripts around it. Thanks to Julia for her help! 
Signed-off-by: Thomas Gleixner Cc: Julia Lawall Cc: Jiang Liu --- kernel/irq/chip.c | 27 ++++++++------------------- kernel/irq/handle.c | 4 +++- kernel/irq/irqdesc.c | 2 +- kernel/irq/resend.c | 2 +- 4 files changed, 13 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 8c55d54..e28169d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -372,7 +372,6 @@ static bool irq_may_run(struct irq_desc *desc) /** * handle_simple_irq - Simple and software-decoded IRQs. - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Simple interrupts are either sent from a demultiplexing interrupt @@ -382,8 +381,7 @@ static bool irq_may_run(struct irq_desc *desc) * Note: The caller is expected to handle the ack, clear, mask and * unmask issues if necessary. */ -void -handle_simple_irq(unsigned int irq, struct irq_desc *desc) +void handle_simple_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); @@ -425,7 +423,6 @@ static void cond_unmask_irq(struct irq_desc *desc) /** * handle_level_irq - Level type irq handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Level type interrupts are active as long as the hardware line has @@ -433,8 +430,7 @@ static void cond_unmask_irq(struct irq_desc *desc) * it after the associated handler has acknowledged the device, so the * interrupt line is back to inactive. */ -void -handle_level_irq(unsigned int irq, struct irq_desc *desc) +void handle_level_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); mask_ack_irq(desc); @@ -496,7 +492,6 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) /** * handle_fasteoi_irq - irq handler for transparent controllers - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Only a single callback will be issued to the chip: an ->eoi() @@ -504,8 +499,7 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) * for modern forms of interrupt handlers, which handle the flow * details in hardware, transparently. */ -void -handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) +void handle_fasteoi_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; @@ -546,7 +540,6 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** * handle_edge_irq - edge type IRQ handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Interrupt occures on the falling and/or rising edge of a hardware @@ -560,8 +553,7 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq); * the handler was running. If all pending interrupts are handled, the * loop is left. */ -void -handle_edge_irq(unsigned int irq, struct irq_desc *desc) +void handle_edge_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); @@ -618,13 +610,12 @@ EXPORT_SYMBOL(handle_edge_irq); #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER /** * handle_edge_eoi_irq - edge eoi type IRQ handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Similar as the above handle_edge_irq, but using eoi and w/o the * mask/unmask logic. 
*/ -void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) +void handle_edge_eoi_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); @@ -665,13 +656,11 @@ out_eoi: /** * handle_percpu_irq - Per CPU local irq handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements */ -void -handle_percpu_irq(unsigned int irq, struct irq_desc *desc) +void handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); @@ -688,7 +677,6 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) /** * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements. Same as @@ -698,11 +686,12 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) * contain the real device id for the cpu on which this handler is * called */ -void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) +void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; void *dev_id = raw_cpu_ptr(action->percpu_dev_id); + unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; kstat_incr_irqs_this_cpu(desc); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index b6eeea8..de41a68 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -27,8 +27,10 @@ * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ -void handle_bad_irq(unsigned int irq, struct irq_desc *desc) +void handle_bad_irq(struct irq_desc *desc) { + unsigned int irq = irq_desc_get_irq(desc); + print_irq_desc(irq, desc); kstat_incr_irqs_this_cpu(desc); ack_bad_irq(irq); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 5966694..239e2ae 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -347,7 +347,7 @@ int generic_handle_irq(unsigned int irq) if (!desc) return -EINVAL; - generic_handle_irq_desc(irq, desc); + generic_handle_irq_desc(desc); return 0; } EXPORT_SYMBOL_GPL(generic_handle_irq); diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index dd95f44..b86886b 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -38,7 +38,7 @@ static void resend_irqs(unsigned long arg) clear_bit(irq, irqs_resend); desc = irq_to_desc(irq); local_irq_disable(); - desc->handle_irq(irq, desc); + desc->handle_irq(desc); local_irq_enable(); } } -- cgit v1.1 From f9f9e7b776142fb1c0782cade004cc8e0147a199 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Sep 2015 11:51:12 -0400 Subject: Revert "cgroup: simplify threadgroup locking" This reverts commit b5ba75b5fc0e8404e2c50cb68f39bb6a53fc916f. d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem") and b5ba75b5fc0e ("cgroup: simplify threadgroup locking") changed how cgroup synchronizes against task fork and exits so that it uses global percpu_rwsem instead of per-process rwsem; unfortunately, the write [un]lock paths of percpu_rwsem always involve synchronize_rcu_expedited() which turned out to be too expensive. Improvements for percpu_rwsem are scheduled to be merged in the coming v4.4-rc1 merge window which alleviates this issue. For now, revert the two commits to restore per-process rwsem. They will be re-applied for the v4.4-rc1 merge window. 
Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com Reported-by: Christian Borntraeger Cc: Oleg Nesterov Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: Paolo Bonzini Cc: stable@vger.kernel.org # v4.2+ --- kernel/cgroup.c | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2cf0f79..115091e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2460,13 +2460,14 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (!cgrp) return -ENODEV; - percpu_down_write(&cgroup_threadgroup_rwsem); +retry_find_task: rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { + rcu_read_unlock(); ret = -ESRCH; - goto out_unlock_rcu; + goto out_unlock_cgroup; } } else { tsk = current; @@ -2482,23 +2483,37 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, */ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; - goto out_unlock_rcu; + rcu_read_unlock(); + goto out_unlock_cgroup; } get_task_struct(tsk); rcu_read_unlock(); + percpu_down_write(&cgroup_threadgroup_rwsem); + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * a race with de_thread from another thread's exec() + * may strip us of our leadership, if this happens, + * there is no choice but to throw this task away and + * try again; this is + * "double-double-toil-and-trouble-check locking". + */ + percpu_up_write(&cgroup_threadgroup_rwsem); + put_task_struct(tsk); + goto retry_find_task; + } + } + ret = cgroup_procs_write_permission(tsk, cgrp, of); if (!ret) ret = cgroup_attach_task(cgrp, tsk, threadgroup); - put_task_struct(tsk); - goto out_unlock_threadgroup; - -out_unlock_rcu: - rcu_read_unlock(); -out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); + + put_task_struct(tsk); +out_unlock_cgroup: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } @@ -2643,8 +2658,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - /* look up all csses currently attached to @cgrp's subtree */ down_read(&css_set_rwsem); css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { @@ -2700,8 +2713,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; last_task = task; + percpu_down_write(&cgroup_threadgroup_rwsem); + /* raced against de_thread() from another thread? */ + if (!thread_group_leader(task)) { + percpu_up_write(&cgroup_threadgroup_rwsem); + put_task_struct(task); + continue; + } + ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); + percpu_up_write(&cgroup_threadgroup_rwsem); put_task_struct(task); if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) @@ -2711,7 +2733,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) out_finish: cgroup_migrate_finish(&preloaded_csets); - percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } -- cgit v1.1 From 0c986253b939cc14c69d4adbe2b4121bdf4aa220 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Sep 2015 11:51:12 -0400 Subject: Revert "sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem" This reverts commit d59cfc09c32a2ae31f1c3bc2983a0cd79afb3f14. 
d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem") and b5ba75b5fc0e ("cgroup: simplify threadgroup locking") changed how cgroup synchronizes against task fork and exits so that it uses global percpu_rwsem instead of per-process rwsem; unfortunately, the write [un]lock paths of percpu_rwsem always involve synchronize_rcu_expedited() which turned out to be too expensive. Improvements for percpu_rwsem are scheduled to be merged in the coming v4.4-rc1 merge window which alleviates this issue. For now, revert the two commits to restore per-process rwsem. They will be re-applied for the v4.4-rc1 merge window. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com Reported-by: Christian Borntraeger Cc: Oleg Nesterov Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: Paolo Bonzini Cc: stable@vger.kernel.org # v4.2+ --- kernel/cgroup.c | 77 ++++++++++++++++++++++++++++++++++++++++++--------------- kernel/fork.c | 4 +++ 2 files changed, 61 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 115091e..2c9eae6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -104,8 +103,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(release_agent_path_lock); -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; - #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ @@ -874,6 +871,48 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } +void cgroup_threadgroup_change_begin(struct task_struct *tsk) +{ + down_read(&tsk->signal->group_rwsem); +} + +void cgroup_threadgroup_change_end(struct task_struct *tsk) +{ + up_read(&tsk->signal->group_rwsem); +} + +/** + * threadgroup_lock - lock threadgroup + * @tsk: member task of the threadgroup to lock + * + * Lock the threadgroup @tsk belongs to. No new task is allowed to enter + * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or + * change ->group_leader/pid. This is useful for cases where the threadgroup + * needs to stay stable across blockable operations. + * + * fork and exit explicitly call threadgroup_change_{begin|end}() for + * synchronization. While held, no new task will be added to threadgroup + * and no existing live task will have its PF_EXITING set. + * + * de_thread() does threadgroup_change_{begin|end}() when a non-leader + * sub-thread becomes a new leader. + */ +static void threadgroup_lock(struct task_struct *tsk) +{ + down_write(&tsk->signal->group_rwsem); +} + +/** + * threadgroup_unlock - unlock threadgroup + * @tsk: member task of the threadgroup to unlock + * + * Reverse threadgroup_lock(). + */ +static inline void threadgroup_unlock(struct task_struct *tsk) +{ + up_write(&tsk->signal->group_rwsem); +} + static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kf_root->kn->priv; @@ -2074,9 +2113,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, lockdep_assert_held(&css_set_rwsem); /* - * We are synchronized through cgroup_threadgroup_rwsem against - * PF_EXITING setting such that we can't race against cgroup_exit() - * changing the css_set to init_css_set and dropping the old one. 
+ * We are synchronized through threadgroup_lock() against PF_EXITING + * setting such that we can't race against cgroup_exit() changing the + * css_set to init_css_set and dropping the old one. */ WARN_ON_ONCE(tsk->flags & PF_EXITING); old_cset = task_css_set(tsk); @@ -2133,11 +2172,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * @src_cset and add it to @preloaded_csets, which should later be cleaned * up by cgroup_migrate_finish(). * - * This function may be called without holding cgroup_threadgroup_rwsem - * even if the target is a process. Threads may be created and destroyed - * but as long as cgroup_mutex is not dropped, no new css_set can be put - * into play and the preloaded css_sets are guaranteed to cover all - * migrations. + * This function may be called without holding threadgroup_lock even if the + * target is a process. Threads may be created and destroyed but as long + * as cgroup_mutex is not dropped, no new css_set can be put into play and + * the preloaded css_sets are guaranteed to cover all migrations. */ static void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, @@ -2240,7 +2278,7 @@ err: * @threadgroup: whether @leader points to the whole process or a single task * * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding cgroup_threadgroup_rwsem. The + * process, the caller must be holding threadgroup_lock of @leader. The * caller is also responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). @@ -2368,7 +2406,7 @@ out_release_tset: * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * - * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. + * Call holding cgroup_mutex and threadgroup_lock of @leader. */ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) @@ -2490,7 +2528,7 @@ retry_find_task: get_task_struct(tsk); rcu_read_unlock(); - percpu_down_write(&cgroup_threadgroup_rwsem); + threadgroup_lock(tsk); if (threadgroup) { if (!thread_group_leader(tsk)) { /* @@ -2500,7 +2538,7 @@ retry_find_task: * try again; this is * "double-double-toil-and-trouble-check locking". */ - percpu_up_write(&cgroup_threadgroup_rwsem); + threadgroup_unlock(tsk); put_task_struct(tsk); goto retry_find_task; } @@ -2510,7 +2548,7 @@ retry_find_task: if (!ret) ret = cgroup_attach_task(cgrp, tsk, threadgroup); - percpu_up_write(&cgroup_threadgroup_rwsem); + threadgroup_unlock(tsk); put_task_struct(tsk); out_unlock_cgroup: @@ -2713,17 +2751,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; last_task = task; - percpu_down_write(&cgroup_threadgroup_rwsem); + threadgroup_lock(task); /* raced against de_thread() from another thread? 
*/ if (!thread_group_leader(task)) { - percpu_up_write(&cgroup_threadgroup_rwsem); + threadgroup_unlock(task); put_task_struct(task); continue; } ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); - percpu_up_write(&cgroup_threadgroup_rwsem); + threadgroup_unlock(task); put_task_struct(task); if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) @@ -5045,7 +5083,6 @@ int __init cgroup_init(void) unsigned long key; int ssid, err; - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); diff --git a/kernel/fork.c b/kernel/fork.c index 7d5f0f1..2845623 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1149,6 +1149,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->group_rwsem); +#endif + sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; -- cgit v1.1 From de9b8f5dcbd94bfb1d249907a635f1fb1968e19c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2015 23:09:29 +0200 Subject: sched: Fix crash trying to dequeue/enqueue the idle thread Sasha reports that his virtual machine tries to schedule the idle thread since commit 6c37067e2786 ("sched: Change the sched_class::set_cpus_allowed() calling context"). Hit trace shows this happening from idle_thread_get()->init_idle(), which is the _second_ init_idle() invocation on that task_struct, the first being done through idle_init()->fork_idle(). (this code is insane...) Because we call init_idle() twice in a row, its ->sched_class == &idle_sched_class and ->on_rq = TASK_ON_RQ_QUEUED. This means do_set_cpus_allowed() think we're queued and will call dequeue_task(), which is implemented with BUG() for the idle class, seeing how dequeueing the idle task is a daft thing. Aside of the whole insanity of calling init_idle() _twice_, change the code to call set_cpus_allowed_common() instead as this is 'obviously' before the idle task gets ran etc.. Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 6c37067e2786 ("sched: Change the sched_class::set_cpus_allowed() calling context") Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 97d276f..f0d043e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4927,7 +4927,15 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - do_set_cpus_allowed(idle, cpumask_of(cpu)); +#ifdef CONFIG_SMP + /* + * Its possible that init_idle() gets called multiple times on a task, + * in that case do_set_cpus_allowed() will not do the right thing. + * + * And since this is boot we can forgo the serialization. 
+ */ + set_cpus_allowed_common(idle, cpumask_of(cpu)); +#endif /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the cpu isn't yet set to this cpu so the @@ -4944,7 +4952,7 @@ void init_idle(struct task_struct *idle, int cpu) rq->curr = rq->idle = idle; idle->on_rq = TASK_ON_RQ_QUEUED; -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP idle->on_cpu = 1; #endif raw_spin_unlock(&rq->lock); @@ -4959,7 +4967,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif } -- cgit v1.1 From f55fc2a57cc9ca3b1bb4fb8eb25b6e1989e5b993 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Sep 2015 19:06:33 +0200 Subject: perf: Restructure perf syscall point of no return The exclusive_event_installable() stuff only works because its exclusive with the grouping bits. Rework the code such that there is a sane place to error out before we go do things we cannot undo. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f548f69..39679f7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8297,13 +8297,30 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { gctx = group_leader->ctx; + mutex_lock_double(&gctx->mutex, &ctx->mutex); + } else { + mutex_lock(&ctx->mutex); + } + + /* + * Must be under the same ctx::mutex as perf_install_in_context(), + * because we need to serialize with concurrent event creation. + */ + if (!exclusive_event_installable(event, ctx)) { + /* exclusive and group stuff are assumed mutually exclusive */ + WARN_ON_ONCE(move_group); + + err = -EBUSY; + goto err_locked; + } + WARN_ON_ONCE(ctx->parent_ctx); + + if (move_group) { /* * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ - mutex_lock_double(&gctx->mutex, &ctx->mutex); - perf_remove_from_context(group_leader, false); list_for_each_entry(sibling, &group_leader->sibling_list, @@ -8311,13 +8328,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_remove_from_context(sibling, false); put_ctx(gctx); } - } else { - mutex_lock(&ctx->mutex); - } - - WARN_ON_ONCE(ctx->parent_ctx); - if (move_group) { /* * Wait for everybody to stop referencing the events through * the old lists, before installing it on new lists. @@ -8349,22 +8360,20 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); get_ctx(ctx); - } - if (!exclusive_event_installable(event, ctx)) { - err = -EBUSY; - mutex_unlock(&ctx->mutex); - fput(event_file); - goto err_context; + /* + * Now that all events are installed in @ctx, nothing + * references @gctx anymore, so drop the last reference we have + * on it. 
+ */ + put_ctx(gctx); } perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); - if (move_group) { + if (move_group) mutex_unlock(&gctx->mutex); - put_ctx(gctx); - } mutex_unlock(&ctx->mutex); put_online_cpus(); @@ -8391,6 +8400,12 @@ SYSCALL_DEFINE5(perf_event_open, fd_install(event_fd, event_file); return event_fd; +err_locked: + if (move_group) + mutex_unlock(&gctx->mutex); + mutex_unlock(&ctx->mutex); +/* err_file: */ + fput(event_file); err_context: perf_unpin_context(ctx); put_ctx(ctx); -- cgit v1.1 From a723968c0ed36db676478c3d26078f13484fe01c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Sep 2015 19:06:33 +0200 Subject: perf: Fix u16 overflows Vince reported that its possible to overflow the various size fields and get weird stuff if you stick too many events in a group. Put a lid on this by requiring the fixed record size not exceed 16k. This is still a fair amount of events (silly amount really) and leaves plenty room for callchains and stack dwarves while also avoiding overflowing the u16 variables. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 50 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 39679f7..dbb5329 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1243,11 +1243,7 @@ static inline void perf_event__state_init(struct perf_event *event) PERF_EVENT_STATE_INACTIVE; } -/* - * Called at perf_event creation and when events are attached/detached from a - * group. - */ -static void perf_event__read_size(struct perf_event *event) +static void __perf_event_read_size(struct perf_event *event, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; @@ -1263,7 +1259,7 @@ static void perf_event__read_size(struct perf_event *event) entry += sizeof(u64); if (event->attr.read_format & PERF_FORMAT_GROUP) { - nr += event->group_leader->nr_siblings; + nr += nr_siblings; size += sizeof(u64); } @@ -1271,14 +1267,11 @@ static void perf_event__read_size(struct perf_event *event) event->read_size = size; } -static void perf_event__header_size(struct perf_event *event) +static void __perf_event_header_size(struct perf_event *event, u64 sample_type) { struct perf_sample_data *data; - u64 sample_type = event->attr.sample_type; u16 size = 0; - perf_event__read_size(event); - if (sample_type & PERF_SAMPLE_IP) size += sizeof(data->ip); @@ -1303,6 +1296,17 @@ static void perf_event__header_size(struct perf_event *event) event->header_size = size; } +/* + * Called at perf_event creation and when events are attached/detached from a + * group. + */ +static void perf_event__header_size(struct perf_event *event) +{ + __perf_event_read_size(event, + event->group_leader->nr_siblings); + __perf_event_header_size(event, event->attr.sample_type); +} + static void perf_event__id_header_size(struct perf_event *event) { struct perf_sample_data *data; @@ -1330,6 +1334,27 @@ static void perf_event__id_header_size(struct perf_event *event) event->id_header_size = size; } +static bool perf_event_validate_size(struct perf_event *event) +{ + /* + * The values computed here will be over-written when we actually + * attach the event. 
+ */ + __perf_event_read_size(event, event->group_leader->nr_siblings + 1); + __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); + perf_event__id_header_size(event); + + /* + * Sum the lot; should not exceed the 64k limit we have on records. + * Conservative limit to allow for callchains and other variable fields. + */ + if (event->read_size + event->header_size + + event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) + return false; + + return true; +} + static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader, *pos; @@ -8302,6 +8327,11 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&ctx->mutex); } + if (!perf_event_validate_size(event)) { + err = -E2BIG; + goto err_locked; + } + /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. -- cgit v1.1 From f73e22ab450140830005581c2c7ec389791a1b8d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Sep 2015 20:48:22 +0200 Subject: perf: Fix races in computing the header sizes There are two races with the current code: - Another event can join the group and compute a larger header_size concurrently, if the smaller store wins we'll have an incorrect header_size set. - We compute the header_size after the event becomes active, therefore its possible to use the size before its computed. Remedy the first by moving the computation inside the ctx::mutex lock, and the second by placing it _before_ perf_install_in_context(). Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index dbb5329..b11756f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8399,6 +8399,15 @@ SYSCALL_DEFINE5(perf_event_open, put_ctx(gctx); } + /* + * Precalculate sample_data sizes; do while holding ctx::mutex such + * that we're serialized against further additions and before + * perf_install_in_context() which is the point the event is active and + * can use these values. + */ + perf_event__header_size(event); + perf_event__id_header_size(event); + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); @@ -8415,12 +8424,6 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(¤t->perf_event_mutex); /* - * Precalculate sample_data sizes - */ - perf_event__header_size(event); - perf_event__id_header_size(event); - - /* * Drop the reference on the group_event after placing the * new event on the sibling_list. This ensures destruction * of the group leader will find the pointer to itself in -- cgit v1.1 From 00cc1633816de8c95f337608a1ea64e228faf771 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Fri, 18 Sep 2015 11:27:45 +0200 Subject: sched: access local runqueue directly in single_task_running Commit 2ee507c47293 ("sched: Add function single_task_running to let a task check if it is the only task running on a cpu") referenced the current runqueue with the smp_processor_id. When CONFIG_DEBUG_PREEMPT is enabled, that is only allowed if preemption is disabled or the currrent task is bound to the local cpu (e.g. kernel worker). With commit f78195129963 ("kvm: add halt_poll_ns module parameter") KVM calls single_task_running. 
If CONFIG_DEBUG_PREEMPT is enabled that generates a lot of kernel messages. To avoid adding preemption in that cases, as it would limit the usefulness, we change single_task_running to access directly the cpu local runqueue. Cc: Tim Chen Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Cc: Fixes: 2ee507c472939db4b146d545352b8a7c79ef47f8 Signed-off-by: Dominik Dingel Signed-off-by: Paolo Bonzini --- kernel/sched/core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3595403..4064f79 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2666,13 +2666,20 @@ unsigned long nr_running(void) /* * Check if only the current task is running on the cpu. + * + * Caution: this function does not check that the caller has disabled + * preemption, thus the result might have a time-of-check-to-time-of-use + * race. The caller is responsible to use it correctly, for example: + * + * - from a non-preemptable section (of course) + * + * - from a thread that is bound to a single CPU + * + * - in a loop with very short iterations (e.g. a polling loop) */ bool single_task_running(void) { - if (cpu_rq(smp_processor_id())->nr_running == 1) - return true; - else - return false; + return raw_rq()->nr_running == 1; } EXPORT_SYMBOL(single_task_running); -- cgit v1.1 From 19a5ecde086a6a5287978b12ae948fa691b197b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 20 Sep 2015 21:01:22 -0700 Subject: rcu: Suppress lockdep false positive for rcp->exp_funnel_mutex In kernels built with CONFIG_PREEMPT=y, synchronize_rcu_expedited() invokes synchronize_sched_expedited() while holding RCU-preempt's root rcu_node structure's ->exp_funnel_mutex, which is acquired after the rcu_data structure's ->exp_funnel_mutex. The first thing that synchronize_sched_expedited() will do is acquire RCU-sched's rcu_data structure's ->exp_funnel_mutex. There is no danger of an actual deadlock because the locking order is always from RCU-preempt's expedited mutexes to those of RCU-sched. Unfortunately, lockdep considers both rcu_data structures' ->exp_funnel_mutex to be in the same lock class and therefore reports a deadlock cycle. This commit silences this false positive by placing RCU-sched's rcu_data structures' ->exp_funnel_mutex locks into their own lock class. Reported-by: Sasha Levin Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9f75f25..775d36c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3868,6 +3868,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { + static struct lock_class_key rcu_exp_sched_rdp_class; unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); @@ -3883,6 +3884,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (rsp == &rcu_sched_state) + lockdep_set_class_and_name(&rdp->exp_funnel_mutex, + &rcu_exp_sched_rdp_class, + "rcu_data_exp_sched"); } /* -- cgit v1.1 From ac5be6b47e8bd25b62bed2c82cda7398999f59e9 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Sep 2015 14:58:49 -0700 Subject: userfaultfd: revert "userfaultfd: waitqueue: add nr wake parameter to __wake_up_locked_key" This reverts commit 51360155eccb907ff8635bd10fc7de876408c2e0 and adapts fs/userfaultfd.c to use the old version of that function. It didn't look robust to call __wake_up_common with "nr == 1" when we absolutely require wakeall semantics, but we've full control of what we insert in the two waitqueue heads of the blocked userfaults. No exclusive waitqueue risks to be inserted into those two waitqueue heads so we can as well stick to "nr == 1" of the old code and we can rely purely on the fact no waitqueue inserted in one of the two waitqueue heads we must enforce as wakeall, has wait->flags WQ_FLAG_EXCLUSIVE set. Signed-off-by: Andrea Arcangeli Cc: Dr. David Alan Gilbert Cc: Michael Ellerman Cc: Shuah Khan Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/wait.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 272d932..052e026 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, - void *key) +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) { - __wake_up_common(q, mode, nr, 0, key); + __wake_up_common(q, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); @@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, 1, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); -- cgit v1.1 From 21199f27b430576552b26210b3194a363d7f05cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2015 16:10:40 +0200 Subject: locking/lockdep: Fix hlock->pin_count reset on lock stack rebuilds Various people reported hitting the "unpinning an unpinned lock" warning. As it turns out there are 2 places where we take a lock out of the middle of a stack, and in those cases it would fail to preserve the pin_count when rebuilding the lock stack. 
Reported-by: Sasha Levin Reported-by: Tim Spriggs Tested-by: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davej@codemonkey.org.uk Link: http://lkml.kernel.org/r/20150916141040.GA11639@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8acfbf7..4e49cc4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3068,7 +3068,7 @@ static int __lock_is_held(struct lockdep_map *lock); static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, struct lockdep_map *nest_lock, unsigned long ip, - int references) + int references, int pin_count) { struct task_struct *curr = current; struct lock_class *class = NULL; @@ -3157,7 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->waittime_stamp = 0; hlock->holdtime_stamp = lockstat_clock(); #endif - hlock->pin_count = 0; + hlock->pin_count = pin_count; if (check && !mark_irqflags(curr, hlock)) return 0; @@ -3343,7 +3343,7 @@ found_it: hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references)) + hlock->references, hlock->pin_count)) return 0; } @@ -3433,7 +3433,7 @@ found_it: hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references)) + hlock->references, hlock->pin_count)) return 0; } @@ -3583,7 +3583,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, current->lockdep_recursion = 1; trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip, 0); + irqs_disabled_flags(flags), nest_lock, ip, 0, 0); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } -- cgit v1.1 From 95c2b17534654829db428f11bcf4297c059a2a7e Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 26 Sep 2015 12:23:56 +0100 Subject: genirq: Fix race in register_irq_proc() Per-IRQ directories in procfs are created only when a handler is first added to the irqdesc, not when the irqdesc is created. In the case of a shared IRQ, multiple tasks can race to create a directory. This race condition seems to have been present forever, but is easier to hit with async probing. 
Signed-off-by: Ben Hutchings
Link: http://lkml.kernel.org/r/1443266636.2004.2.camel@decadent.org.uk
Signed-off-by: Thomas Gleixner
Cc: stable@vger.kernel.org
---
 kernel/irq/proc.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index e3a8c95..a50ddc9 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,6 +12,7 @@
 #include <linux/seq_file.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/mutex.h>
 
 #include "internals.h"
 
@@ -323,18 +324,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 
 void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 {
+	static DEFINE_MUTEX(register_lock);
 	char name [MAX_NAMELEN];
 
-	if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
+	if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip))
 		return;
 
+	/*
+	 * irq directories are registered only when a handler is
+	 * added, not when the descriptor is created, so multiple
+	 * tasks might try to register at the same time.
+	 */
+	mutex_lock(&register_lock);
+
+	if (desc->dir)
+		goto out_unlock;
+
 	memset(name, 0, MAX_NAMELEN);
 	sprintf(name, "%d", irq);
 
 	/* create /proc/irq/1234 */
 	desc->dir = proc_mkdir(name, root_irq_dir);
 	if (!desc->dir)
-		return;
+		goto out_unlock;
 
 #ifdef CONFIG_SMP
 	/* create /proc/irq/<irq>/smp_affinity */
@@ -355,6 +367,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 
 	proc_create_data("spurious", 0444, desc->dir,
 			 &irq_spurious_proc_fops, (void *)(long)irq);
+
+out_unlock:
+	mutex_unlock(&register_lock);
 }
 
 void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
-- cgit v1.1

From 67dfae0cd72fec5cd158b6e5fb1647b7dbe0834c Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Mon, 14 Sep 2015 18:05:20 -0700
Subject: clocksource: Fix abs() usage w/ 64bit values

This patch fixes one cases where abs() was being used with 64-bit
nanosecond values, where the result may be capped at 32-bits. This
potentially could cause watchdog false negatives on 32-bit systems, so
this patch addresses the issue by using abs64().

Signed-off-by: John Stultz
Cc: Prarit Bhargava
Cc: Richard Cochran
Cc: Ingo Molnar
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1442279124-7309-2-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner
---
 kernel/time/clocksource.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 841b72f..3a38775 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -217,7 +217,7 @@ static void clocksource_watchdog(unsigned long data)
 			continue;
 
 		/* Check the deviation from the watchdog clocksource. */
-		if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
+		if (abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
 			pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
 				cs->name);
 			pr_warn("                      '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
-- cgit v1.1

From 95913d97914f44db2b81271c2e2ebd4d2ac2df83 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 29 Sep 2015 14:45:09 +0200
Subject: sched/core: Fix TASK_DEAD race in finish_task_switch()

So the problem this patch is trying to address is as follows:

        CPU0                            CPU1

        context_switch(A, B)
                                        ttwu(A)
                                          LOCK A->pi_lock
                                          A->on_cpu == 0
        finish_task_switch(A)
          prev_state = A->state  <-.
          WMB                       |
          A->on_cpu = 0;            |
          UNLOCK rq0->lock          |
                                    |    context_switch(C, A)
                                    `--  A->state = TASK_DEAD
          prev_state == TASK_DEAD
            put_task_struct(A)

                        context_switch(A, C)
                          finish_task_switch(A)
                            A->state == TASK_DEAD
                              put_task_struct(A)

The argument being that the WMB will allow the load of A->state on CPU0
to cross over and observe CPU1's store of A->state, which will then
result in a double-drop and use-after-free.

Now the comment states (and this was true once upon a long time ago)
that we need to observe A->state while holding rq->lock because that
will order us against the wakeup; however the wakeup will not in fact
acquire (that) rq->lock; it takes A->pi_lock these days.

We can obviously fix this by upgrading the WMB to an MB, but that is
expensive, so we'd rather avoid that.

The alternative this patch takes is: smp_store_release(&A->on_cpu, 0),
which avoids the MB on some archs, but not important ones like ARM.

Reported-by: Oleg Nesterov
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Linus Torvalds
Cc: # v3.1+
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Cc: manfred@colorfullife.com
Cc: will.deacon@arm.com
Fixes: e4a52bcb9a18 ("sched: Remove rq->lock from the first half of ttwu()")
Link: http://lkml.kernel.org/r/20150929124509.GG3816@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c  | 10 +++++-----
 kernel/sched/sched.h |  5 +++--
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6159531..10a8faa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2517,11 +2517,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
 	 * schedule one last time. The schedule call will never return, and
 	 * the scheduled task must drop that reference.
-	 * The test for TASK_DEAD must occur while the runqueue locks are
-	 * still held, otherwise prev could be scheduled on another cpu, die
-	 * there before we look at prev->state, and then the reference would
-	 * be dropped twice.
-	 *		Manfred Spraul
+	 *
+	 * We must observe prev->state before clearing prev->on_cpu (in
+	 * finish_lock_switch), otherwise a concurrent wakeup can get prev
+	 * running on another CPU and we could rave with its RUNNING -> DEAD
+	 * transition, resulting in a double drop.
 	 */
 	prev_state = prev->state;
 	vtime_task_switch(prev);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 68cda11..6d2a119 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1078,9 +1078,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
 	 * We must ensure this doesn't happen until the switch is completely
 	 * finished.
+	 *
+	 * Pairs with the control dependency and rmb in try_to_wake_up().
 	 */
-	smp_wmb();
-	prev->on_cpu = 0;
+	smp_store_release(&prev->on_cpu, 0);
 #endif
 #ifdef CONFIG_DEBUG_SPINLOCK
 	/* this is a valid case when another task releases the spinlock */
-- cgit v1.1
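
The smp_store_release() change in the last patch above is an instance of a
general release/acquire pattern, and it can be sketched outside the kernel.
The short, self-contained C11 program below models ->on_cpu and ->state with
C11 atomics and pthreads rather than the kernel's primitives; struct task,
finisher(), waker() and TASK_DEAD_DEMO are names invented for this
illustration only and do not correspond to scheduler symbols. The release
store keeps the earlier ->state load from being reordered past the clearing
of ->on_cpu, and the acquire load on the waker side pairs with it, which is
the same ordering argument the commit message makes for finish_lock_switch()
and try_to_wake_up().

/*
 * Stand-alone sketch (illustrative names only) of the release/acquire
 * pairing used by the patch above; C11 atomics stand in for
 * smp_store_release() and the acquire side in try_to_wake_up().
 */
#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define TASK_DEAD_DEMO	64		/* stand-in for TASK_DEAD */

struct task {
	atomic_int on_cpu;	/* 1 while the switching-out CPU still uses the task */
	int state;		/* written by the waker only after on_cpu == 0 */
};

static struct task prev_task = { .on_cpu = 1, .state = 0 };

/* finish_task_switch() side: load ->state, then release ->on_cpu. */
static void *finisher(void *arg)
{
	int prev_state = prev_task.state;

	/*
	 * Release store: the ->state load above cannot be reordered past
	 * this, so it can never observe the waker's TASK_DEAD_DEMO store.
	 */
	atomic_store_explicit(&prev_task.on_cpu, 0, memory_order_release);

	assert(prev_state != TASK_DEAD_DEMO);	/* cannot fire */
	(void)arg;
	return NULL;
}

/* try_to_wake_up() side: wait for ->on_cpu == 0, then write ->state. */
static void *waker(void *arg)
{
	/* Acquire load pairs with the release store in finisher(). */
	while (atomic_load_explicit(&prev_task.on_cpu, memory_order_acquire))
		;				/* spin until released */

	prev_task.state = TASK_DEAD_DEMO;	/* ordered after finisher's load */
	(void)arg;
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, finisher, NULL);
	pthread_create(&b, NULL, waker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("final state = %d\n", prev_task.state);
	return 0;
}

Building with something like "cc -std=c11 -pthread taskdead_demo.c" should
work; the assert in finisher() can never trigger because the waker's store
to ->state is ordered after the finisher's load by the acquire/release
pairing, which is exactly the double-drop the kernel patch rules out.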