From e93ad19d05648397ef3bcb838d26aec06c245dc0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 19 Jan 2016 12:18:41 -0500
Subject: cpuset: make mm migration asynchronous

If "cpuset.memory_migrate" is set, when a process is moved from one
cpuset to another with a different memory node mask, pages in used by
the process are migrated to the new set of nodes.  This was performed
synchronously in the ->attach() callback, which is synchronized
against process management.  Recently, the synchronization was changed
from per-process rwsem to global percpu rwsem for simplicity and
optimization.

Combined with the synchronous mm migration, this led to deadlocks
because mm migration could schedule a work item which may in turn try
to create a new worker blocking on the process management lock held
from cgroup process migration path.

This heavy an operation shouldn't be performed synchronously from that
deep inside cgroup migration in the first place.  This patch punts the
actual migration to an ordered workqueue and updates cgroup process
migration and cpuset config update paths to flush the workqueue after
all locks are released.  This way, the operations still seem
synchronous to userland without entangling mm migration with process
management synchronization.  CPU hotplug can also invoke mm migration
but there's no reason for it to wait for mm migrations and thus
doesn't synchronize against their completions.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: stable@vger.kernel.org # v4.4+
---
 kernel/cgroup.c |  2 ++
 kernel/cpuset.c | 71 +++++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 51 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c03a640..88abd4d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -58,6 +58,7 @@
 #include <linux/kthread.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/cpuset.h>
 #include <net/sock.h>
 
 /*
@@ -2739,6 +2740,7 @@ out_unlock_rcu:
 out_unlock_threadgroup:
 	percpu_up_write(&cgroup_threadgroup_rwsem);
 	cgroup_kn_unlock(of->kn);
+	cpuset_post_attach_flush();
 	return ret ?: nbytes;
 }
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e945fc..41989ab 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -287,6 +287,8 @@ static struct cpuset top_cpuset = {
 static DEFINE_MUTEX(cpuset_mutex);
 static DEFINE_SPINLOCK(callback_lock);
 
+static struct workqueue_struct *cpuset_migrate_mm_wq;
+
 /*
  * CPU / memory hotplug is handled asynchronously.
  */
@@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 }
 
 /*
- * cpuset_migrate_mm
- *
- *    Migrate memory region from one set of nodes to another.
- *
- *    Temporarilly set tasks mems_allowed to target nodes of migration,
- *    so that the migration code can allocate pages on these nodes.
- *
- *    While the mm_struct we are migrating is typically from some
- *    other task, the task_struct mems_allowed that we are hacking
- *    is for our current task, which must allocate new pages for that
- *    migrating memory region.
+ * Migrate memory region from one set of nodes to another.  This is
+ * performed asynchronously as it can be called from process migration path
+ * holding locks involved in process management.  All mm migrations are
+ * performed in the queued order and can be waited for by flushing
+ * cpuset_migrate_mm_wq.
  */
 
+struct cpuset_migrate_mm_work {
+	struct work_struct	work;
+	struct mm_struct	*mm;
+	nodemask_t		from;
+	nodemask_t		to;
+};
+
+static void cpuset_migrate_mm_workfn(struct work_struct *work)
+{
+	struct cpuset_migrate_mm_work *mwork =
+		container_of(work, struct cpuset_migrate_mm_work, work);
+
+	/* on a wq worker, no need to worry about %current's mems_allowed */
+	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
+	mmput(mwork->mm);
+	kfree(mwork);
+}
+
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 							const nodemask_t *to)
 {
-	struct task_struct *tsk = current;
-
-	tsk->mems_allowed = *to;
+	struct cpuset_migrate_mm_work *mwork;
 
-	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
+	if (mwork) {
+		mwork->mm = mm;
+		mwork->from = *from;
+		mwork->to = *to;
+		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+		queue_work(cpuset_migrate_mm_wq, &mwork->work);
+	} else {
+		mmput(mm);
+	}
+}
 
-	rcu_read_lock();
-	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
-	rcu_read_unlock();
+void cpuset_post_attach_flush(void)
+{
+	flush_workqueue(cpuset_migrate_mm_wq);
 }
 
 /*
@@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs)
 		mpol_rebind_mm(mm, &cs->mems_allowed);
 		if (migrate)
 			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
-		mmput(mm);
+		else
+			mmput(mm);
 	}
 	css_task_iter_end(&it);
 
@@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 			 * @old_mems_allowed is the right nodesets that we
 			 * migrate mm from.
 			 */
-			if (is_memory_migrate(cs)) {
+			if (is_memory_migrate(cs))
 				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
 						  &cpuset_attach_nodemask_to);
-			}
-			mmput(mm);
+			else
+				mmput(mm);
 		}
 	}
 
@@ -1714,6 +1737,7 @@ out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
+	flush_workqueue(cpuset_migrate_mm_wq);
 	return retval ?: nbytes;
 }
 
@@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void)
 	top_cpuset.effective_mems = node_states[N_MEMORY];
 
 	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+
+	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
+	BUG_ON(!cpuset_migrate_mm_wq);
 }
 
 /**
-- 
cgit v1.1


From aa226ff4a1ce79f229c6b7a4c0a14e17fececd01 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 21 Jan 2016 15:31:11 -0500
Subject: cgroup: make sure a parent css isn't offlined before its children

There are three subsystem callbacks in css shutdown path -
css_offline(), css_released() and css_free().  Except for
css_released(), cgroup core didn't guarantee the order of invocation.
css_offline() or css_free() could be called on a parent css before its
children.  This behavior is unexpected and led to bugs in cpu and
memory controller.

This patch updates offline path so that a parent css is never offlined
before its children.  Each css keeps online_cnt which reaches zero iff
itself and all its children are offline and offline_css() is invoked
only after online_cnt reaches zero.

This fixes the memory controller bug and allows the fix for cpu
controller.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reported-by: Brian Christiansen <brian.o.christiansen@gmail.com>
Link: http://lkml.kernel.org/g/5698A023.9070703@de.ibm.com
Link: http://lkml.kernel.org/g/CAKB58ikDkzc8REt31WBkD99+hxNzjK4+FBmhkgS+NVrC9vjMSg@mail.gmail.com
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org
---
 kernel/cgroup.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 88abd4d..d015877 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4760,6 +4760,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 	INIT_LIST_HEAD(&css->sibling);
 	INIT_LIST_HEAD(&css->children);
 	css->serial_nr = css_serial_nr_next++;
+	atomic_set(&css->online_cnt, 0);
 
 	if (cgroup_parent(cgrp)) {
 		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
@@ -4782,6 +4783,10 @@ static int online_css(struct cgroup_subsys_state *css)
 	if (!ret) {
 		css->flags |= CSS_ONLINE;
 		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
+
+		atomic_inc(&css->online_cnt);
+		if (css->parent)
+			atomic_inc(&css->parent->online_cnt);
 	}
 	return ret;
 }
@@ -5019,10 +5024,15 @@ static void css_killed_work_fn(struct work_struct *work)
 		container_of(work, struct cgroup_subsys_state, destroy_work);
 
 	mutex_lock(&cgroup_mutex);
-	offline_css(css);
-	mutex_unlock(&cgroup_mutex);
 
-	css_put(css);
+	do {
+		offline_css(css);
+		css_put(css);
+		/* @css can't go away while we're holding cgroup_mutex */
+		css = css->parent;
+	} while (css && atomic_dec_and_test(&css->online_cnt));
+
+	mutex_unlock(&cgroup_mutex);
 }
 
 /* css kill confirmation processing requires process context, bounce */
@@ -5031,8 +5041,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 	struct cgroup_subsys_state *css =
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 
-	INIT_WORK(&css->destroy_work, css_killed_work_fn);
-	queue_work(cgroup_destroy_wq, &css->destroy_work);
+	if (atomic_dec_and_test(&css->online_cnt)) {
+		INIT_WORK(&css->destroy_work, css_killed_work_fn);
+		queue_work(cgroup_destroy_wq, &css->destroy_work);
+	}
 }
 
 /**
-- 
cgit v1.1


From 8bb5ef79bc0f4016ecf79e8dce6096a3c63603e4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 21 Jan 2016 15:32:15 -0500
Subject: cgroup: make sure a parent css isn't freed before its children

There are three subsystem callbacks in css shutdown path -
css_offline(), css_released() and css_free().  Except for
css_released(), cgroup core didn't guarantee the order of invocation.
css_offline() or css_free() could be called on a parent css before its
children.  This behavior is unexpected and led to bugs in cpu and
memory controller.

The previous patch updated ordering for css_offline() which fixes the
cpu controller issue.  While there currently isn't a known bug caused
by misordering of css_free() invocations, let's fix it too for
consistency.

css_free() ordering can be trivially fixed by moving putting of the
parent css below css_free() invocation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/cgroup.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d015877..d27904c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4657,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work)
 
 	if (ss) {
 		/* css free path */
+		struct cgroup_subsys_state *parent = css->parent;
 		int id = css->id;
 
-		if (css->parent)
-			css_put(css->parent);
-
 		ss->css_free(css);
 		cgroup_idr_remove(&ss->css_idr, id);
 		cgroup_put(cgrp);
+
+		if (parent)
+			css_put(parent);
 	} else {
 		/* cgroup free path */
 		atomic_dec(&cgrp->root->nr_cgrps);
-- 
cgit v1.1


From 23d11a58a9a60dcb52c8fc6494efce908b24c295 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Jan 2016 05:59:46 -0500
Subject: workqueue: skip flush dependency checks for legacy workqueues

fca839c00a12 ("workqueue: warn if memory reclaim tries to flush
!WQ_MEM_RECLAIM workqueue") implemented flush dependency warning which
triggers if a PF_MEMALLOC task or WQ_MEM_RECLAIM workqueue tries to
flush a !WQ_MEM_RECLAIM workquee.

This assumes that workqueues marked with WQ_MEM_RECLAIM sit in memory
reclaim path and making it depend on something which may need more
memory to make forward progress can lead to deadlocks.  Unfortunately,
workqueues created with the legacy create*_workqueue() interface
always have WQ_MEM_RECLAIM regardless of whether they are depended
upon memory reclaim or not.  These spurious WQ_MEM_RECLAIM markings
cause spurious triggering of the flush dependency checks.

  WARNING: CPU: 0 PID: 6 at kernel/workqueue.c:2361 check_flush_dependency+0x138/0x144()
  workqueue: WQ_MEM_RECLAIM deferwq:deferred_probe_work_func is flushing !WQ_MEM_RECLAIM events:lru_add_drain_per_cpu
  ...
  Workqueue: deferwq deferred_probe_work_func
  [<c0017acc>] (unwind_backtrace) from [<c0013134>] (show_stack+0x10/0x14)
  [<c0013134>] (show_stack) from [<c0245f18>] (dump_stack+0x94/0xd4)
  [<c0245f18>] (dump_stack) from [<c0026f9c>] (warn_slowpath_common+0x80/0xb0)
  [<c0026f9c>] (warn_slowpath_common) from [<c0026ffc>] (warn_slowpath_fmt+0x30/0x40)
  [<c0026ffc>] (warn_slowpath_fmt) from [<c00390b8>] (check_flush_dependency+0x138/0x144)
  [<c00390b8>] (check_flush_dependency) from [<c0039ca0>] (flush_work+0x50/0x15c)
  [<c0039ca0>] (flush_work) from [<c00c51b0>] (lru_add_drain_all+0x130/0x180)
  [<c00c51b0>] (lru_add_drain_all) from [<c00f728c>] (migrate_prep+0x8/0x10)
  [<c00f728c>] (migrate_prep) from [<c00bfbc4>] (alloc_contig_range+0xd8/0x338)
  [<c00bfbc4>] (alloc_contig_range) from [<c00f8f18>] (cma_alloc+0xe0/0x1ac)
  [<c00f8f18>] (cma_alloc) from [<c001cac4>] (__alloc_from_contiguous+0x38/0xd8)
  [<c001cac4>] (__alloc_from_contiguous) from [<c001ceb4>] (__dma_alloc+0x240/0x278)
  [<c001ceb4>] (__dma_alloc) from [<c001cf78>] (arm_dma_alloc+0x54/0x5c)
  [<c001cf78>] (arm_dma_alloc) from [<c0355ea4>] (dmam_alloc_coherent+0xc0/0xec)
  [<c0355ea4>] (dmam_alloc_coherent) from [<c039cc4c>] (ahci_port_start+0x150/0x1dc)
  [<c039cc4c>] (ahci_port_start) from [<c0384734>] (ata_host_start.part.3+0xc8/0x1c8)
  [<c0384734>] (ata_host_start.part.3) from [<c03898dc>] (ata_host_activate+0x50/0x148)
  [<c03898dc>] (ata_host_activate) from [<c039d558>] (ahci_host_activate+0x44/0x114)
  [<c039d558>] (ahci_host_activate) from [<c039f05c>] (ahci_platform_init_host+0x1d8/0x3c8)
  [<c039f05c>] (ahci_platform_init_host) from [<c039e6bc>] (tegra_ahci_probe+0x448/0x4e8)
  [<c039e6bc>] (tegra_ahci_probe) from [<c0347058>] (platform_drv_probe+0x50/0xac)
  [<c0347058>] (platform_drv_probe) from [<c03458cc>] (driver_probe_device+0x214/0x2c0)
  [<c03458cc>] (driver_probe_device) from [<c0343cc0>] (bus_for_each_drv+0x60/0x94)
  [<c0343cc0>] (bus_for_each_drv) from [<c03455d8>] (__device_attach+0xb0/0x114)
  [<c03455d8>] (__device_attach) from [<c0344ab8>] (bus_probe_device+0x84/0x8c)
  [<c0344ab8>] (bus_probe_device) from [<c0344f48>] (deferred_probe_work_func+0x68/0x98)
  [<c0344f48>] (deferred_probe_work_func) from [<c003b738>] (process_one_work+0x120/0x3f8)
  [<c003b738>] (process_one_work) from [<c003ba48>] (worker_thread+0x38/0x55c)
  [<c003ba48>] (worker_thread) from [<c0040f14>] (kthread+0xdc/0xf4)
  [<c0040f14>] (kthread) from [<c000f778>] (ret_from_fork+0x14/0x3c)

Fix it by marking workqueues created via create*_workqueue() with
__WQ_LEGACY and disabling flush dependency checks on them.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Thierry Reding <thierry.reding@gmail.com>
Link: http://lkml.kernel.org/g/20160126173843.GA11115@ulmo.nvidia.com
Fixes: fca839c00a12 ("workqueue: warn if memory reclaim tries to flush !WQ_MEM_RECLAIM workqueue")
---
 kernel/workqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 61a0264..dc7faad 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2355,7 +2355,8 @@ static void check_flush_dependency(struct workqueue_struct *target_wq,
 	WARN_ONCE(current->flags & PF_MEMALLOC,
 		  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf",
 		  current->pid, current->comm, target_wq->name, target_func);
-	WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM),
+	WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
+			      (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
 		  "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf",
 		  worker->current_pwq->wq->name, worker->current_func,
 		  target_wq->name, target_func);
-- 
cgit v1.1


From 4355efbd80482a961cae849281a8ef866e53d55c Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@suse.com>
Date: Wed, 3 Feb 2016 16:55:26 +1030
Subject: modules: fix modparam async_probe request

Commit f2411da746985 ("driver-core: add driver module
asynchronous probe support") added async probe support,
in two forms:

  * in-kernel driver specification annotation
  * generic async_probe module parameter (modprobe foo async_probe)

To support the generic kernel parameter parse_args() was
extended via commit ecc8617053e0 ("module: add extra
argument for parse_params() callback") however commit
failed to f2411da746985 failed to add the required argument.

This causes a crash then whenever async_probe generic
module parameter is used. This was overlooked when the
form in which in-kernel async probe support was reworked
a bit... Fix this as originally intended.

Cc: Hannes Reinecke <hare@suse.de>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: stable@vger.kernel.org (4.2+)
Signed-off-by: Luis R. Rodriguez <mcgrof@suse.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> [minimized]
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 8358f46..2149f70 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3496,7 +3496,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
 	/* Module is ready to execute: parsing args may do that. */
 	after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-				  -32768, 32767, NULL,
+				  -32768, 32767, mod,
 				  unknown_module_param_cb);
 	if (IS_ERR(after_dashes)) {
 		err = PTR_ERR(after_dashes);
-- 
cgit v1.1


From 2e7bac536106236104e9e339531ff0fcdb7b8147 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 3 Feb 2016 16:55:26 +1030
Subject: module: wrapper for symbol name.

This trivial wrapper adds clarity and makes the following patch
smaller.

Cc: stable@kernel.org
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 2149f70..1e79d81 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3627,6 +3627,11 @@ static inline int is_arm_mapping_symbol(const char *str)
 	       && (str[2] == '\0' || str[2] == '.');
 }
 
+static const char *symname(struct module *mod, unsigned int symnum)
+{
+	return mod->strtab + mod->symtab[symnum].st_name;
+}
+
 static const char *get_ksymbol(struct module *mod,
 			       unsigned long addr,
 			       unsigned long *size,
@@ -3649,15 +3654,15 @@ static const char *get_ksymbol(struct module *mod,
 
 		/* We ignore unnamed symbols: they're uninformative
 		 * and inserted at a whim. */
+		if (*symname(mod, i) == '\0'
+		    || is_arm_mapping_symbol(symname(mod, i)))
+			continue;
+
 		if (mod->symtab[i].st_value <= addr
-		    && mod->symtab[i].st_value > mod->symtab[best].st_value
-		    && *(mod->strtab + mod->symtab[i].st_name) != '\0'
-		    && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+		    && mod->symtab[i].st_value > mod->symtab[best].st_value)
 			best = i;
 		if (mod->symtab[i].st_value > addr
-		    && mod->symtab[i].st_value < nextval
-		    && *(mod->strtab + mod->symtab[i].st_name) != '\0'
-		    && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+		    && mod->symtab[i].st_value < nextval)
 			nextval = mod->symtab[i].st_value;
 	}
 
@@ -3668,7 +3673,7 @@ static const char *get_ksymbol(struct module *mod,
 		*size = nextval - mod->symtab[best].st_value;
 	if (offset)
 		*offset = addr - mod->symtab[best].st_value;
-	return mod->strtab + mod->symtab[best].st_name;
+	return symname(mod, best);
 }
 
 /* For kallsyms to ask for address resolution.  NULL means not found.  Careful
@@ -3763,8 +3768,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 		if (symnum < mod->num_symtab) {
 			*value = mod->symtab[symnum].st_value;
 			*type = mod->symtab[symnum].st_info;
-			strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
-				KSYM_NAME_LEN);
+			strlcpy(name, symname(mod, symnum), KSYM_NAME_LEN);
 			strlcpy(module_name, mod->name, MODULE_NAME_LEN);
 			*exported = is_exported(name, *value, mod);
 			preempt_enable();
@@ -3781,7 +3785,7 @@ static unsigned long mod_find_symname(struct module *mod, const char *name)
 	unsigned int i;
 
 	for (i = 0; i < mod->num_symtab; i++)
-		if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 &&
+		if (strcmp(name, symname(mod, i)) == 0 &&
 		    mod->symtab[i].st_info != 'U')
 			return mod->symtab[i].st_value;
 	return 0;
@@ -3825,7 +3829,7 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
 		if (mod->state == MODULE_STATE_UNFORMED)
 			continue;
 		for (i = 0; i < mod->num_symtab; i++) {
-			ret = fn(data, mod->strtab + mod->symtab[i].st_name,
+			ret = fn(data, symname(mod, i),
 				 mod, mod->symtab[i].st_value);
 			if (ret != 0)
 				return ret;
-- 
cgit v1.1


From 8244062ef1e54502ef55f54cced659913f244c3e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 3 Feb 2016 16:55:26 +1030
Subject: modules: fix longstanding /proc/kallsyms vs module insertion race.

For CONFIG_KALLSYMS, we keep two symbol tables and two string tables.
There's one full copy, marked SHF_ALLOC and laid out at the end of the
module's init section.  There's also a cut-down version that only
contains core symbols and strings, and lives in the module's core
section.

After module init (and before we free the module memory), we switch
the mod->symtab, mod->num_symtab and mod->strtab to point to the core
versions.  We do this under the module_mutex.

However, kallsyms doesn't take the module_mutex: it uses
preempt_disable() and rcu tricks to walk through the modules, because
it's used in the oops path.  It's also used in /proc/kallsyms.
There's nothing atomic about the change of these variables, so we can
get the old (larger!) num_symtab and the new symtab pointer; in fact
this is what I saw when trying to reproduce.

By grouping these variables together, we can use a
carefully-dereferenced pointer to ensure we always get one or the
other (the free of the module init section is already done in an RCU
callback, so that's safe).  We allocate the init one at the end of the
module init section, and keep the core one inside the struct module
itself (it could also have been allocated at the end of the module
core, but that's probably overkill).

Reported-by: Weilong Chen <chenweilong@huawei.com>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111541
Cc: stable@kernel.org
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 112 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 69 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 1e79d81..9537da3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -303,6 +303,9 @@ struct load_info {
 	struct _ddebug *debug;
 	unsigned int num_debug;
 	bool sig_ok;
+#ifdef CONFIG_KALLSYMS
+	unsigned long mod_kallsyms_init_off;
+#endif
 	struct {
 		unsigned int sym, str, mod, vers, info, pcpu;
 	} index;
@@ -2480,10 +2483,21 @@ static void layout_symtab(struct module *mod, struct load_info *info)
 	strsect->sh_flags |= SHF_ALLOC;
 	strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
 					 info->index.str) | INIT_OFFSET_MASK;
-	mod->init_layout.size = debug_align(mod->init_layout.size);
 	pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+
+	/* We'll tack temporary mod_kallsyms on the end. */
+	mod->init_layout.size = ALIGN(mod->init_layout.size,
+				      __alignof__(struct mod_kallsyms));
+	info->mod_kallsyms_init_off = mod->init_layout.size;
+	mod->init_layout.size += sizeof(struct mod_kallsyms);
+	mod->init_layout.size = debug_align(mod->init_layout.size);
 }
 
+/*
+ * We use the full symtab and strtab which layout_symtab arranged to
+ * be appended to the init section.  Later we switch to the cut-down
+ * core-only ones.
+ */
 static void add_kallsyms(struct module *mod, const struct load_info *info)
 {
 	unsigned int i, ndst;
@@ -2492,29 +2506,34 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
 	char *s;
 	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
 
-	mod->symtab = (void *)symsec->sh_addr;
-	mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+	/* Set up to point into init section. */
+	mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
+
+	mod->kallsyms->symtab = (void *)symsec->sh_addr;
+	mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
 	/* Make sure we get permanent strtab: don't use info->strtab. */
-	mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+	mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
 
 	/* Set types up while we still have access to sections. */
-	for (i = 0; i < mod->num_symtab; i++)
-		mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
-
-	mod->core_symtab = dst = mod->core_layout.base + info->symoffs;
-	mod->core_strtab = s = mod->core_layout.base + info->stroffs;
-	src = mod->symtab;
-	for (ndst = i = 0; i < mod->num_symtab; i++) {
+	for (i = 0; i < mod->kallsyms->num_symtab; i++)
+		mod->kallsyms->symtab[i].st_info
+			= elf_type(&mod->kallsyms->symtab[i], info);
+
+	/* Now populate the cut down core kallsyms for after init. */
+	mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
+	mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
+	src = mod->kallsyms->symtab;
+	for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
 		if (i == 0 ||
 		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
 				   info->index.pcpu)) {
 			dst[ndst] = src[i];
-			dst[ndst++].st_name = s - mod->core_strtab;
-			s += strlcpy(s, &mod->strtab[src[i].st_name],
+			dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
+			s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
 				     KSYM_NAME_LEN) + 1;
 		}
 	}
-	mod->core_num_syms = ndst;
+	mod->core_kallsyms.num_symtab = ndst;
 }
 #else
 static inline void layout_symtab(struct module *mod, struct load_info *info)
@@ -3263,9 +3282,8 @@ static noinline int do_init_module(struct module *mod)
 	module_put(mod);
 	trim_init_extable(mod);
 #ifdef CONFIG_KALLSYMS
-	mod->num_symtab = mod->core_num_syms;
-	mod->symtab = mod->core_symtab;
-	mod->strtab = mod->core_strtab;
+	/* Switch to core kallsyms now init is done: kallsyms may be walking! */
+	rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
 #endif
 	mod_tree_remove_init(mod);
 	disable_ro_nx(&mod->init_layout);
@@ -3627,9 +3645,9 @@ static inline int is_arm_mapping_symbol(const char *str)
 	       && (str[2] == '\0' || str[2] == '.');
 }
 
-static const char *symname(struct module *mod, unsigned int symnum)
+static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum)
 {
-	return mod->strtab + mod->symtab[symnum].st_name;
+	return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
 }
 
 static const char *get_ksymbol(struct module *mod,
@@ -3639,6 +3657,7 @@ static const char *get_ksymbol(struct module *mod,
 {
 	unsigned int i, best = 0;
 	unsigned long nextval;
+	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
 
 	/* At worse, next value is at end of module */
 	if (within_module_init(addr, mod))
@@ -3648,32 +3667,32 @@ static const char *get_ksymbol(struct module *mod,
 
 	/* Scan for closest preceding symbol, and next symbol. (ELF
 	   starts real symbols at 1). */
-	for (i = 1; i < mod->num_symtab; i++) {
-		if (mod->symtab[i].st_shndx == SHN_UNDEF)
+	for (i = 1; i < kallsyms->num_symtab; i++) {
+		if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
 			continue;
 
 		/* We ignore unnamed symbols: they're uninformative
 		 * and inserted at a whim. */
-		if (*symname(mod, i) == '\0'
-		    || is_arm_mapping_symbol(symname(mod, i)))
+		if (*symname(kallsyms, i) == '\0'
+		    || is_arm_mapping_symbol(symname(kallsyms, i)))
 			continue;
 
-		if (mod->symtab[i].st_value <= addr
-		    && mod->symtab[i].st_value > mod->symtab[best].st_value)
+		if (kallsyms->symtab[i].st_value <= addr
+		    && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value)
 			best = i;
-		if (mod->symtab[i].st_value > addr
-		    && mod->symtab[i].st_value < nextval)
-			nextval = mod->symtab[i].st_value;
+		if (kallsyms->symtab[i].st_value > addr
+		    && kallsyms->symtab[i].st_value < nextval)
+			nextval = kallsyms->symtab[i].st_value;
 	}
 
 	if (!best)
 		return NULL;
 
 	if (size)
-		*size = nextval - mod->symtab[best].st_value;
+		*size = nextval - kallsyms->symtab[best].st_value;
 	if (offset)
-		*offset = addr - mod->symtab[best].st_value;
-	return symname(mod, best);
+		*offset = addr - kallsyms->symtab[best].st_value;
+	return symname(kallsyms, best);
 }
 
 /* For kallsyms to ask for address resolution.  NULL means not found.  Careful
@@ -3763,18 +3782,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 
 	preempt_disable();
 	list_for_each_entry_rcu(mod, &modules, list) {
+		struct mod_kallsyms *kallsyms;
+
 		if (mod->state == MODULE_STATE_UNFORMED)
 			continue;
-		if (symnum < mod->num_symtab) {
-			*value = mod->symtab[symnum].st_value;
-			*type = mod->symtab[symnum].st_info;
-			strlcpy(name, symname(mod, symnum), KSYM_NAME_LEN);
+		kallsyms = rcu_dereference_sched(mod->kallsyms);
+		if (symnum < kallsyms->num_symtab) {
+			*value = kallsyms->symtab[symnum].st_value;
+			*type = kallsyms->symtab[symnum].st_info;
+			strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN);
 			strlcpy(module_name, mod->name, MODULE_NAME_LEN);
 			*exported = is_exported(name, *value, mod);
 			preempt_enable();
 			return 0;
 		}
-		symnum -= mod->num_symtab;
+		symnum -= kallsyms->num_symtab;
 	}
 	preempt_enable();
 	return -ERANGE;
@@ -3783,11 +3805,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 static unsigned long mod_find_symname(struct module *mod, const char *name)
 {
 	unsigned int i;
+	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
 
-	for (i = 0; i < mod->num_symtab; i++)
-		if (strcmp(name, symname(mod, i)) == 0 &&
-		    mod->symtab[i].st_info != 'U')
-			return mod->symtab[i].st_value;
+	for (i = 0; i < kallsyms->num_symtab; i++)
+		if (strcmp(name, symname(kallsyms, i)) == 0 &&
+		    kallsyms->symtab[i].st_info != 'U')
+			return kallsyms->symtab[i].st_value;
 	return 0;
 }
 
@@ -3826,11 +3849,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
 	module_assert_mutex();
 
 	list_for_each_entry(mod, &modules, list) {
+		/* We hold module_mutex: no need for rcu_dereference_sched */
+		struct mod_kallsyms *kallsyms = mod->kallsyms;
+
 		if (mod->state == MODULE_STATE_UNFORMED)
 			continue;
-		for (i = 0; i < mod->num_symtab; i++) {
-			ret = fn(data, symname(mod, i),
-				 mod, mod->symtab[i].st_value);
+		for (i = 0; i < kallsyms->num_symtab; i++) {
+			ret = fn(data, symname(kallsyms, i),
+				 mod, kallsyms->symtab[i].st_value);
 			if (ret != 0)
 				return ret;
 		}
-- 
cgit v1.1


From 823dd3224a07f618d652a7743c9603222d019de3 Mon Sep 17 00:00:00 2001
From: Sasha Levin <sasha.levin@oracle.com>
Date: Fri, 5 Feb 2016 15:36:05 -0800
Subject: signals: avoid random wakeups in sigsuspend()

A random wakeup can get us out of sigsuspend() without TIF_SIGPENDING
being set.

Avoid that by making sure we were signaled, like sys_pause() does.

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index f3f1f7a..0508544 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set)
 	current->saved_sigmask = current->blocked;
 	set_current_blocked(set);
 
-	__set_current_state(TASK_INTERRUPTIBLE);
-	schedule();
+	while (!signal_pending(current)) {
+		__set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+	}
 	set_restore_sigmask();
 	return -ERESTARTNOHAND;
 }
-- 
cgit v1.1


From 8a5fd56431fe1682e870bd6ab0c276e74befbeb9 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Thu, 4 Feb 2016 14:40:40 +0100
Subject: locking/lockdep: Fix stack trace caching logic

check_prev_add() caches saved stack trace in static trace variable
to avoid duplicate save_trace() calls in dependencies involving trylocks.
But that caching logic contains a bug. We may not save trace on first
iteration due to early return from check_prev_add(). Then on the
second iteration when we actually need the trace we don't save it
because we think that we've already saved it.

Let check_prev_add() itself control when stack is saved.

There is another bug. Trace variable is protected by graph lock.
But we can temporary release graph lock during printing.

Fix this by invalidating cached stack trace when we release graph lock.

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: glider@google.com
Cc: kcc@google.com
Cc: peter@hurleysoftware.com
Cc: sasha.levin@oracle.com
Link: http://lkml.kernel.org/r/1454593240-121647-1-git-send-email-dvyukov@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 60ace56..c7710e4 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1822,7 +1822,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
  */
 static int
 check_prev_add(struct task_struct *curr, struct held_lock *prev,
-	       struct held_lock *next, int distance, int trylock_loop)
+	       struct held_lock *next, int distance, int *stack_saved)
 {
 	struct lock_list *entry;
 	int ret;
@@ -1883,8 +1883,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 		}
 	}
 
-	if (!trylock_loop && !save_trace(&trace))
-		return 0;
+	if (!*stack_saved) {
+		if (!save_trace(&trace))
+			return 0;
+		*stack_saved = 1;
+	}
 
 	/*
 	 * Ok, all validations passed, add the new lock
@@ -1907,6 +1910,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 * Debugging printouts:
 	 */
 	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
+		/* We drop graph lock, so another thread can overwrite trace. */
+		*stack_saved = 0;
 		graph_unlock();
 		printk("\n new dependency: ");
 		print_lock_name(hlock_class(prev));
@@ -1929,7 +1934,7 @@ static int
 check_prevs_add(struct task_struct *curr, struct held_lock *next)
 {
 	int depth = curr->lockdep_depth;
-	int trylock_loop = 0;
+	int stack_saved = 0;
 	struct held_lock *hlock;
 
 	/*
@@ -1956,7 +1961,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		 */
 		if (hlock->read != 2 && hlock->check) {
 			if (!check_prev_add(curr, hlock, next,
-						distance, trylock_loop))
+						distance, &stack_saved))
 				return 0;
 			/*
 			 * Stop after the first non-trylock entry,
@@ -1979,7 +1984,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		if (curr->held_locks[depth].irq_context !=
 				curr->held_locks[depth-1].irq_context)
 			break;
-		trylock_loop = 1;
 	}
 	return 1;
 out_bug:
-- 
cgit v1.1


From 041bd12e272c53a35c54c13875839bcb98c999ce Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 9 Feb 2016 16:11:26 -0500
Subject: Revert "workqueue: make sure delayed work run in local cpu"

This reverts commit 874bbfe600a660cba9c776b3957b1ce393151b76.

Workqueue used to implicity guarantee that work items queued without
explicit CPU specified are put on the local CPU.  Recent changes in
timer broke the guarantee and led to vmstat breakage which was fixed
by 176bed1de5bf ("vmstat: explicitly schedule per-cpu work on the CPU
we need it to run on").

vmstat is the most likely to expose the issue and it's quite possible
that there are other similar problems which are a lot more difficult
to trigger.  As a preventive measure, 874bbfe600a6 ("workqueue: make
sure delayed work run in local cpu") was applied to restore the local
CPU guarnatee.  Unfortunately, the change exposed a bug in timer code
which got fixed by 22b886dd1018 ("timers: Use proper base migration in
add_timer_on()").  Due to code restructuring, the commit couldn't be
backported beyond certain point and stable kernels which only had
874bbfe600a6 started crashing.

The local CPU guarantee was accidental more than anything else and we
want to get rid of it anyway.  As, with the vmstat case fixed,
874bbfe600a6 is causing more problems than it's fixing, it has been
decided to take the chance and officially break the guarantee by
reverting the commit.  A debug feature will be added to force foreign
CPU assignment to expose cases relying on the guarantee and fixes for
the individual cases will be backported to stable as necessary.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: 874bbfe600a6 ("workqueue: make sure delayed work run in local cpu")
Link: http://lkml.kernel.org/g/20160120211926.GJ10810@quack.suse.cz
Cc: stable@vger.kernel.org
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Cc: Daniel Bilik <daniel.bilik@neosystem.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Shaohua Li <shli@fb.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Daniel Bilik <daniel.bilik@neosystem.cz>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Michal Hocko <mhocko@kernel.org>
---
 kernel/workqueue.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dc7faad..5e63d3b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1464,13 +1464,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 	timer_stats_timer_set_start_info(&dwork->timer);
 
 	dwork->wq = wq;
-	/* timer isn't guaranteed to run in this cpu, record earlier */
-	if (cpu == WORK_CPU_UNBOUND)
-		cpu = raw_smp_processor_id();
 	dwork->cpu = cpu;
 	timer->expires = jiffies + delay;
 
-	add_timer_on(timer, cpu);
+	if (unlikely(cpu != WORK_CPU_UNBOUND))
+		add_timer_on(timer, cpu);
+	else
+		add_timer(timer);
 }
 
 /**
-- 
cgit v1.1


From ef557180447fa9a7a0affd3abb21ecceb4b5e125 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <umgwanakikbuti@gmail.com>
Date: Tue, 9 Feb 2016 17:59:38 -0500
Subject: workqueue: schedule WORK_CPU_UNBOUND work on wq_unbound_cpumask CPUs

WORK_CPU_UNBOUND work items queued to a bound workqueue always run
locally.  This is a good thing normally, but not when the user has
asked us to keep unbound work away from certain CPUs.  Round robin
these to wq_unbound_cpumask CPUs instead, as perturbation avoidance
trumps performance.

tj: Cosmetic and comment changes.  WARN_ON_ONCE() dropped from empty
    (wq_unbound_cpumask AND cpu_online_mask).  If we want that, it
    should be done when config changes.

Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5e63d3b..0547746 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -301,7 +301,11 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
-static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
+/* PL: allowable cpus for unbound wqs and work items */
+static cpumask_var_t wq_unbound_cpumask;
+
+/* CPU where unbound work was last round robin scheduled from this CPU */
+static DEFINE_PER_CPU(int, wq_rr_cpu_last);
 
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
@@ -1298,6 +1302,32 @@ static bool is_chained_work(struct workqueue_struct *wq)
 	return worker && worker->current_pwq->wq == wq;
 }
 
+/*
+ * When queueing an unbound work item to a wq, prefer local CPU if allowed
+ * by wq_unbound_cpumask.  Otherwise, round robin among the allowed ones to
+ * avoid perturbing sensitive tasks.
+ */
+static int wq_select_unbound_cpu(int cpu)
+{
+	int new_cpu;
+
+	if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
+		return cpu;
+	if (cpumask_empty(wq_unbound_cpumask))
+		return cpu;
+
+	new_cpu = __this_cpu_read(wq_rr_cpu_last);
+	new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
+	if (unlikely(new_cpu >= nr_cpu_ids)) {
+		new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
+		if (unlikely(new_cpu >= nr_cpu_ids))
+			return cpu;
+	}
+	__this_cpu_write(wq_rr_cpu_last, new_cpu);
+
+	return new_cpu;
+}
+
 static void __queue_work(int cpu, struct workqueue_struct *wq,
 			 struct work_struct *work)
 {
@@ -1323,7 +1353,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
 		return;
 retry:
 	if (req_cpu == WORK_CPU_UNBOUND)
-		cpu = raw_smp_processor_id();
+		cpu = wq_select_unbound_cpu(raw_smp_processor_id());
 
 	/* pwq which will be used unless @work is executing elsewhere */
 	if (!(wq->flags & WQ_UNBOUND))
-- 
cgit v1.1


From f303fccb82928790ec58eea82722bd5c54d300b3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 9 Feb 2016 17:59:38 -0500
Subject: workqueue: implement "workqueue.debug_force_rr_cpu" debug feature

Workqueue used to guarantee local execution for work items queued
without explicit target CPU.  The guarantee is gone now which can
break some usages in subtle ways.  To flush out those cases, this
patch implements a debug feature which forces round-robin CPU
selection for all such work items.

The debug feature defaults to off and can be enabled with a kernel
parameter.  The default can be flipped with a debug config option.

If you hit this commit during bisection, please refer to 041bd12e272c
("Revert "workqueue: make sure delayed work run in local cpu"") for
more information and ping me.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0547746..51d77e7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -307,6 +307,18 @@ static cpumask_var_t wq_unbound_cpumask;
 /* CPU where unbound work was last round robin scheduled from this CPU */
 static DEFINE_PER_CPU(int, wq_rr_cpu_last);
 
+/*
+ * Local execution of unbound work items is no longer guaranteed.  The
+ * following always forces round-robin CPU selection on unbound work items
+ * to uncover usages which depend on it.
+ */
+#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
+static bool wq_debug_force_rr_cpu = true;
+#else
+static bool wq_debug_force_rr_cpu = false;
+#endif
+module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
+
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
 				     cpu_worker_pools);
@@ -1309,10 +1321,17 @@ static bool is_chained_work(struct workqueue_struct *wq)
  */
 static int wq_select_unbound_cpu(int cpu)
 {
+	static bool printed_dbg_warning;
 	int new_cpu;
 
-	if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
-		return cpu;
+	if (likely(!wq_debug_force_rr_cpu)) {
+		if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
+			return cpu;
+	} else if (!printed_dbg_warning) {
+		pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
+		printed_dbg_warning = true;
+	}
+
 	if (cpumask_empty(wq_unbound_cpumask))
 		return cpu;
 
-- 
cgit v1.1


From d6e022f1d207a161cd88e08ef0371554680ffc46 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 3 Feb 2016 13:54:25 -0500
Subject: workqueue: handle NUMA_NO_NODE for unbound pool_workqueue lookup

When looking up the pool_workqueue to use for an unbound workqueue,
workqueue assumes that the target CPU is always bound to a valid NUMA
node.  However, currently, when a CPU goes offline, the mapping is
destroyed and cpu_to_node() returns NUMA_NO_NODE.

This has always been broken but hasn't triggered often enough before
874bbfe600a6 ("workqueue: make sure delayed work run in local cpu").
After the commit, workqueue forcifully assigns the local CPU for
delayed work items without explicit target CPU to fix a different
issue.  This widens the window where CPU can go offline while a
delayed work item is pending causing delayed work items dispatched
with target CPU set to an already offlined CPU.  The resulting
NUMA_NO_NODE mapping makes workqueue try to queue the work item on a
NULL pool_workqueue and thus crash.

While 874bbfe600a6 has been reverted for a different reason making the
bug less visible again, it can still happen.  Fix it by mapping
NUMA_NO_NODE to the default pool_workqueue from unbound_pwq_by_node().
This is a temporary workaround.  The long term solution is keeping CPU
-> NODE mapping stable across CPU off/online cycles which is being
worked on.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Len Brown <len.brown@intel.com>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/g/1454424264.11183.46.camel@gmail.com
Link: http://lkml.kernel.org/g/1453702100-2597-1-git-send-email-tangchen@cn.fujitsu.com
---
 kernel/workqueue.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 51d77e7..7ff5dc7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -586,6 +586,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
 						  int node)
 {
 	assert_rcu_or_wq_mutex_or_pool_mutex(wq);
+
+	/*
+	 * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
+	 * delayed item is pending.  The plan is to keep CPU -> NODE
+	 * mapping valid and stable across CPU on/offlines.  Once that
+	 * happens, this workaround can be removed.
+	 */
+	if (unlikely(node == NUMA_NO_NODE))
+		return wq->dfl_pwq;
+
 	return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
 }
 
-- 
cgit v1.1


From a1b14d27ed0965838350f1377ff97c93ee383492 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 10 Feb 2016 16:47:11 +0100
Subject: bpf: fix branch offset adjustment on backjumps after patching ctx
 expansion

When ctx access is used, the kernel often needs to expand/rewrite
instructions, so after that patching, branch offsets have to be
adjusted for both forward and backward jumps in the new eBPF program,
but for backward jumps it fails to account the delta. Meaning, for
example, if the expansion happens exactly on the insn that sits at
the jump target, it doesn't fix up the back jump offset.

Analysis on what the check in adjust_branches() is currently doing:

  /* adjust offset of jmps if necessary */
  if (i < pos && i + insn->off + 1 > pos)
    insn->off += delta;
  else if (i > pos && i + insn->off + 1 < pos)
    insn->off -= delta;

First condition (forward jumps):

  Before:                         After:

  insns[0]                        insns[0]
  insns[1] <--- i/insn            insns[1] <--- i/insn
  insns[2] <--- pos               insns[P] <--- pos
  insns[3]                        insns[P]  `------| delta
  insns[4] <--- target_X          insns[P]   `-----|
  insns[5]                        insns[3]
                                  insns[4] <--- target_X
                                  insns[5]

First case is if we cross pos-boundary and the jump instruction was
before pos. This is handeled correctly. I.e. if i == pos, then this
would mean our jump that we currently check was the patchlet itself
that we just injected. Since such patchlets are self-contained and
have no awareness of any insns before or after the patched one, the
delta is correctly not adjusted. Also, for the second condition in
case of i + insn->off + 1 == pos, means we jump to that newly patched
instruction, so no offset adjustment are needed. That part is correct.

Second condition (backward jumps):

  Before:                         After:

  insns[0]                        insns[0]
  insns[1] <--- target_X          insns[1] <--- target_X
  insns[2] <--- pos <-- target_Y  insns[P] <--- pos <-- target_Y
  insns[3]                        insns[P]  `------| delta
  insns[4] <--- i/insn            insns[P]   `-----|
  insns[5]                        insns[3]
                                  insns[4] <--- i/insn
                                  insns[5]

Second interesting case is where we cross pos-boundary and the jump
instruction was after pos. Backward jump with i == pos would be
impossible and pose a bug somewhere in the patchlet, so the first
condition checking i > pos is okay only by itself. However, i +
insn->off + 1 < pos does not always work as intended to trigger the
adjustment. It works when jump targets would be far off where the
delta wouldn't matter. But, for example, where the fixed insn->off
before pointed to pos (target_Y), it now points to pos + delta, so
that additional room needs to be taken into account for the check.
This means that i) both tests here need to be adjusted into pos + delta,
and ii) for the second condition, the test needs to be <= as pos
itself can be a target in the backjump, too.

Fixes: 9bac3d6d548e ("bpf: allow extended BPF programs access skb fields")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d1d3e8f..2e7f7ab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2082,7 +2082,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
 		/* adjust offset of jmps if necessary */
 		if (i < pos && i + insn->off + 1 > pos)
 			insn->off += delta;
-		else if (i > pos && i + insn->off + 1 < pos)
+		else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
 			insn->off -= delta;
 	}
 }
-- 
cgit v1.1


From 4a389810bc3cb0e73443104f0827e81e23cb1e12 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 11 Feb 2016 16:13:14 -0800
Subject: kernel/locking/lockdep.c: convert hash tables to hlists

Mike said:

: CONFIG_UBSAN_ALIGNMENT breaks x86-64 kernel with lockdep enabled, i.  e
: kernel with CONFIG_UBSAN_ALIGNMENT fails to load without even any error
: message.
:
: The problem is that ubsan callbacks use spinlocks and might be called
: before lockdep is initialized.  Particularly this line in the
: reserve_ebda_region function causes problem:
:
: lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
:
: If i put lockdep_init() before reserve_ebda_region call in
: x86_64_start_reservations kernel loads well.

Fix this ordering issue permanently: change lockdep so that it uses
hlists for the hash tables.  Unlike a list_head, an hlist_head is in its
initialized state when it is all-zeroes, so lockdep is ready for
operation immediately upon boot - lockdep_init() need not have run.

The patch will also save some memory.

lockdep_init() and lockdep_initialized can be done away with now - a 4.6
patch has been prepared to do this.

Reported-by: Mike Krinkin <krinkin.m.u@gmail.com>
Suggested-by: Mike Krinkin <krinkin.m.u@gmail.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/locking/lockdep.c | 42 +++++++++++++++++++-----------------------
 1 file changed, 19 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 60ace56..7537e56 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -292,7 +292,7 @@ LIST_HEAD(all_lock_classes);
 #define __classhashfn(key)	hash_long((unsigned long)key, CLASSHASH_BITS)
 #define classhashentry(key)	(classhash_table + __classhashfn((key)))
 
-static struct list_head classhash_table[CLASSHASH_SIZE];
+static struct hlist_head classhash_table[CLASSHASH_SIZE];
 
 /*
  * We put the lock dependency chains into a hash-table as well, to cache
@@ -303,7 +303,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE];
 #define __chainhashfn(chain)	hash_long(chain, CHAINHASH_BITS)
 #define chainhashentry(chain)	(chainhash_table + __chainhashfn((chain)))
 
-static struct list_head chainhash_table[CHAINHASH_SIZE];
+static struct hlist_head chainhash_table[CHAINHASH_SIZE];
 
 /*
  * The hash key of the lock dependency chains is a hash itself too:
@@ -666,7 +666,7 @@ static inline struct lock_class *
 look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 {
 	struct lockdep_subclass_key *key;
-	struct list_head *hash_head;
+	struct hlist_head *hash_head;
 	struct lock_class *class;
 
 #ifdef CONFIG_DEBUG_LOCKDEP
@@ -719,7 +719,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return NULL;
 
-	list_for_each_entry_rcu(class, hash_head, hash_entry) {
+	hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
 		if (class->key == key) {
 			/*
 			 * Huh! same key, different name? Did someone trample
@@ -742,7 +742,7 @@ static inline struct lock_class *
 register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 {
 	struct lockdep_subclass_key *key;
-	struct list_head *hash_head;
+	struct hlist_head *hash_head;
 	struct lock_class *class;
 
 	DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -774,7 +774,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	 * We have to do the hash-walk again, to avoid races
 	 * with another CPU:
 	 */
-	list_for_each_entry_rcu(class, hash_head, hash_entry) {
+	hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
 		if (class->key == key)
 			goto out_unlock_set;
 	}
@@ -805,7 +805,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	 * We use RCU's safe list-add method to make
 	 * parallel walking of the hash-list safe:
 	 */
-	list_add_tail_rcu(&class->hash_entry, hash_head);
+	hlist_add_head_rcu(&class->hash_entry, hash_head);
 	/*
 	 * Add it to the global list of classes:
 	 */
@@ -2017,7 +2017,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 				     u64 chain_key)
 {
 	struct lock_class *class = hlock_class(hlock);
-	struct list_head *hash_head = chainhashentry(chain_key);
+	struct hlist_head *hash_head = chainhashentry(chain_key);
 	struct lock_chain *chain;
 	struct held_lock *hlock_curr;
 	int i, j;
@@ -2033,7 +2033,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 	 * We can walk it lock-free, because entries only get added
 	 * to the hash:
 	 */
-	list_for_each_entry_rcu(chain, hash_head, entry) {
+	hlist_for_each_entry_rcu(chain, hash_head, entry) {
 		if (chain->chain_key == chain_key) {
 cache_hit:
 			debug_atomic_inc(chain_lookup_hits);
@@ -2057,7 +2057,7 @@ cache_hit:
 	/*
 	 * We have to walk the chain again locked - to avoid duplicates:
 	 */
-	list_for_each_entry(chain, hash_head, entry) {
+	hlist_for_each_entry(chain, hash_head, entry) {
 		if (chain->chain_key == chain_key) {
 			graph_unlock();
 			goto cache_hit;
@@ -2091,7 +2091,7 @@ cache_hit:
 		}
 		chain_hlocks[chain->base + j] = class - lock_classes;
 	}
-	list_add_tail_rcu(&chain->entry, hash_head);
+	hlist_add_head_rcu(&chain->entry, hash_head);
 	debug_atomic_inc(chain_lookup_misses);
 	inc_chains();
 
@@ -3875,7 +3875,7 @@ void lockdep_reset(void)
 	nr_process_chains = 0;
 	debug_locks = 1;
 	for (i = 0; i < CHAINHASH_SIZE; i++)
-		INIT_LIST_HEAD(chainhash_table + i);
+		INIT_HLIST_HEAD(chainhash_table + i);
 	raw_local_irq_restore(flags);
 }
 
@@ -3894,7 +3894,7 @@ static void zap_class(struct lock_class *class)
 	/*
 	 * Unhash the class and remove it from the all_lock_classes list:
 	 */
-	list_del_rcu(&class->hash_entry);
+	hlist_del_rcu(&class->hash_entry);
 	list_del_rcu(&class->lock_entry);
 
 	RCU_INIT_POINTER(class->key, NULL);
@@ -3917,7 +3917,7 @@ static inline int within(const void *addr, void *start, unsigned long size)
 void lockdep_free_key_range(void *start, unsigned long size)
 {
 	struct lock_class *class;
-	struct list_head *head;
+	struct hlist_head *head;
 	unsigned long flags;
 	int i;
 	int locked;
@@ -3930,9 +3930,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
 	 */
 	for (i = 0; i < CLASSHASH_SIZE; i++) {
 		head = classhash_table + i;
-		if (list_empty(head))
-			continue;
-		list_for_each_entry_rcu(class, head, hash_entry) {
+		hlist_for_each_entry_rcu(class, head, hash_entry) {
 			if (within(class->key, start, size))
 				zap_class(class);
 			else if (within(class->name, start, size))
@@ -3962,7 +3960,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
 void lockdep_reset_lock(struct lockdep_map *lock)
 {
 	struct lock_class *class;
-	struct list_head *head;
+	struct hlist_head *head;
 	unsigned long flags;
 	int i, j;
 	int locked;
@@ -3987,9 +3985,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
 	locked = graph_lock();
 	for (i = 0; i < CLASSHASH_SIZE; i++) {
 		head = classhash_table + i;
-		if (list_empty(head))
-			continue;
-		list_for_each_entry_rcu(class, head, hash_entry) {
+		hlist_for_each_entry_rcu(class, head, hash_entry) {
 			int match = 0;
 
 			for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
@@ -4027,10 +4023,10 @@ void lockdep_init(void)
 		return;
 
 	for (i = 0; i < CLASSHASH_SIZE; i++)
-		INIT_LIST_HEAD(classhash_table + i);
+		INIT_HLIST_HEAD(classhash_table + i);
 
 	for (i = 0; i < CHAINHASH_SIZE; i++)
-		INIT_LIST_HEAD(chainhash_table + i);
+		INIT_HLIST_HEAD(chainhash_table + i);
 
 	lockdep_initialized = 1;
 }
-- 
cgit v1.1


From db78c22230d0bcc8b27b81f05b39f104f08232c5 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 11 Feb 2016 16:13:17 -0800
Subject: mm: fix pfn_t vs highmem

The pfn_t type uses an unsigned long to store a pfn + flags value.  On a
64-bit platform the upper 12 bits of an unsigned long are never used for
storing the value of a pfn.  However, this is not true on highmem
platforms, all 32-bits of a pfn value are used to address a 44-bit
physical address space.  A pfn_t needs to store a 64-bit value.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=112211
Fixes: 01c8f1c44b83 ("mm, dax, gpu: convert vm_insert_mixed to pfn_t")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Stuart Foster <smf.linux@ntlworld.com>
Reported-by: Julian Margetson <runaway@candw.ms>
Tested-by: Julian Margetson <runaway@candw.ms>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/memremap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 70ee377..2c468de 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -150,7 +150,7 @@ void devm_memunmap(struct device *dev, void *addr)
 }
 EXPORT_SYMBOL(devm_memunmap);
 
-pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags)
+pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
 {
 	return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
 }
-- 
cgit v1.1


From c219b7ddb6a3524a61a89438d86e4a8639706308 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 10 Feb 2016 12:04:22 -0500
Subject: sched/deadline: Fix trivial typo in printk() message

It's "too much" not "to much".

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Juri Lelli <juri.lelli@arm.com>
Cc: Jiri Kosina <trivial@kernel.org>
Cc: Juri Lelli <juri.lelli@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160210120422.4ca77e68@gandalf.local.home
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index cd64c97..57b939c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -420,7 +420,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
 	 * entity.
 	 */
 	if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
-		printk_deferred_once("sched: DL replenish lagged to much\n");
+		printk_deferred_once("sched: DL replenish lagged too much\n");
 		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
 		dl_se->runtime = pi_se->dl_runtime;
 	}
-- 
cgit v1.1


From b4f75d44bed1bdbb14ac704bfc38f62a3675e591 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Feb 2016 20:11:20 +0000
Subject: perf/core: Remove bogus UP_CANCELED hotplug state

If CPU_UP_PREPARE fails the perf hotplug code calls perf_event_exit_cpu(),
which is a pointless exercise. The cpu is not online, so the smp function
calls return -ENXIO. So the result is a list walk to call noops.

Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/20160209201007.682184765@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5946460..474ffea 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9286,7 +9286,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 		perf_event_init_cpu(cpu);
 		break;
 
-	case CPU_UP_CANCELED:
 	case CPU_DOWN_PREPARE:
 		perf_event_exit_cpu(cpu);
 		break;
-- 
cgit v1.1


From 27ca9236c96f4a21b72a2b4f08260efeab951bd0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Feb 2016 20:11:26 +0000
Subject: perf/core: Remove the bogus and dangerous CPU_DOWN_FAILED hotplug
 state

If CPU_DOWN_PREPARE fails the perf hotplug notifier is called for
CPU_DOWN_FAILED and calls perf_event_init_cpu(), which checks whether the
swhash is referenced. If yes it allocates a new hash and stores the pointer in
the per cpu data structure.

But at this point the cpu is still online, so there must be a valid hash
already. By overwriting the pointer the existing hash is not longer
accessible.

Remove the CPU_DOWN_FAILED state, as there is nothing to (re)allocate.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/20160209201007.763417379@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 474ffea..4aa64a8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9282,7 +9282,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 	switch (action & ~CPU_TASKS_FROZEN) {
 
 	case CPU_UP_PREPARE:
-	case CPU_DOWN_FAILED:
 		perf_event_init_cpu(cpu);
 		break;
 
-- 
cgit v1.1


From 059fcd8cd16622da6513804a7a3e826d152c6c96 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Feb 2016 20:11:34 +0000
Subject: perf/core: Plug potential memory leak in CPU_UP_PREPARE

If CPU_UP_PREPARE is called it is not guaranteed, that a previously allocated
and assigned hash has been freed already, but perf_event_init_cpu()
unconditionally allocates and assignes a new hash if the swhash is referenced.
By overwriting the pointer the existing hash is not longer accessible.

Verify that there is no hash assigned on this cpu before allocating and
assigning a new one.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/20160209201007.843269966@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4aa64a8..0d58522 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9206,7 +9206,7 @@ static void perf_event_init_cpu(int cpu)
 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
 	mutex_lock(&swhash->hlist_mutex);
-	if (swhash->hlist_refcount > 0) {
+	if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
 		struct swevent_hlist *hlist;
 
 		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
-- 
cgit v1.1


From 7dcd182bec271ab341b05b66b6006995795fc0e7 Mon Sep 17 00:00:00 2001
From: Jessica Yu <jeyu@redhat.com>
Date: Tue, 16 Feb 2016 17:32:33 -0500
Subject: ftrace/module: remove ftrace module notifier

Remove the ftrace module notifier in favor of directly calling
ftrace_module_enable() and ftrace_release_mod() in the module loader.
Hard-coding the function calls directly in the module loader removes
dependence on the module notifier call chain and provides better
visibility and control over what gets called when, which is important
to kernel utilities such as livepatch.

This fixes a notifier ordering issue in which the ftrace module notifier
(and hence ftrace_module_enable()) for coming modules was being called
after klp_module_notify(), which caused livepatch modules to initialize
incorrectly. This patch removes dependence on the module notifier call
chain in favor of hard coding the corresponding function calls in the
module loader. This ensures that ftrace and livepatch code get called in
the correct order on patch module load and unload.

Fixes: 5156dca34a3e ("ftrace: Fix the race between ftrace and insmod")
Signed-off-by: Jessica Yu <jeyu@redhat.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Petr Mladek <pmladek@suse.cz>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/module.c       |  4 ++++
 kernel/trace/ftrace.c | 36 +-----------------------------------
 2 files changed, 5 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 8358f46..b05d466 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -981,6 +981,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 		mod->exit();
 	blocking_notifier_call_chain(&module_notify_list,
 				     MODULE_STATE_GOING, mod);
+	ftrace_release_mod(mod);
+
 	async_synchronize_full();
 
 	/* Store the name of the last unloaded module for diagnostic purposes */
@@ -3295,6 +3297,7 @@ fail:
 	module_put(mod);
 	blocking_notifier_call_chain(&module_notify_list,
 				     MODULE_STATE_GOING, mod);
+	ftrace_release_mod(mod);
 	free_module(mod);
 	wake_up_all(&module_wq);
 	return ret;
@@ -3371,6 +3374,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
 	mod->state = MODULE_STATE_COMING;
 	mutex_unlock(&module_mutex);
 
+	ftrace_module_enable(mod);
 	blocking_notifier_call_chain(&module_notify_list,
 				     MODULE_STATE_COMING, mod);
 	return 0;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eca592f..57a6eea 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4961,7 +4961,7 @@ void ftrace_release_mod(struct module *mod)
 	mutex_unlock(&ftrace_lock);
 }
 
-static void ftrace_module_enable(struct module *mod)
+void ftrace_module_enable(struct module *mod)
 {
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
@@ -5038,38 +5038,8 @@ void ftrace_module_init(struct module *mod)
 	ftrace_process_locs(mod, mod->ftrace_callsites,
 			    mod->ftrace_callsites + mod->num_ftrace_callsites);
 }
-
-static int ftrace_module_notify(struct notifier_block *self,
-				unsigned long val, void *data)
-{
-	struct module *mod = data;
-
-	switch (val) {
-	case MODULE_STATE_COMING:
-		ftrace_module_enable(mod);
-		break;
-	case MODULE_STATE_GOING:
-		ftrace_release_mod(mod);
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-#else
-static int ftrace_module_notify(struct notifier_block *self,
-				unsigned long val, void *data)
-{
-	return 0;
-}
 #endif /* CONFIG_MODULES */
 
-struct notifier_block ftrace_module_nb = {
-	.notifier_call = ftrace_module_notify,
-	.priority = INT_MIN,	/* Run after anything that can remove kprobes */
-};
-
 void __init ftrace_init(void)
 {
 	extern unsigned long __start_mcount_loc[];
@@ -5098,10 +5068,6 @@ void __init ftrace_init(void)
 				  __start_mcount_loc,
 				  __stop_mcount_loc);
 
-	ret = register_module_notifier(&ftrace_module_nb);
-	if (ret)
-		pr_warning("Failed to register trace ftrace module exit notifier\n");
-
 	set_ftrace_early_filters();
 
 	return;
-- 
cgit v1.1


From 9273a8bbf58a15051e53a777389a502420ddc60e Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Wed, 17 Feb 2016 13:11:29 -0800
Subject: devm_memremap_release(): fix memremap'd addr handling

The pmem driver calls devm_memremap() to map a persistent memory range.
When the pmem driver is unloaded, this memremap'd range is not released
so the kernel will leak a vma.

Fix devm_memremap_release() to handle a given memremap'd address
properly.

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/memremap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 2c468de..7a1b5c3 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -114,7 +114,7 @@ EXPORT_SYMBOL(memunmap);
 
 static void devm_memremap_release(struct device *dev, void *res)
 {
-	memunmap(res);
+	memunmap(*(void **)res);
 }
 
 static int devm_memremap_match(struct device *dev, void *res, void *match_data)
-- 
cgit v1.1


From 6e22c8366416251a3d88ba6c92d13d595089f0ed Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.shi@linaro.org>
Date: Fri, 12 Feb 2016 12:46:00 -0800
Subject: tracing, kasan: Silence Kasan warning in check_stack of stack_tracer

When enabling stack trace via "echo 1 > /proc/sys/kernel/stack_tracer_enabled",
the below KASAN warning is triggered:

BUG: KASAN: stack-out-of-bounds in check_stack+0x344/0x848 at addr ffffffc0689ebab8
Read of size 8 by task ksoftirqd/4/29
page:ffffffbdc3a27ac0 count:0 mapcount:0 mapping:          (null) index:0x0
flags: 0x0()
page dumped because: kasan: bad access detected
CPU: 4 PID: 29 Comm: ksoftirqd/4 Not tainted 4.5.0-rc1 #129
Hardware name: Freescale Layerscape 2085a RDB Board (DT)
Call trace:
[<ffffffc000091300>] dump_backtrace+0x0/0x3a0
[<ffffffc0000916c4>] show_stack+0x24/0x30
[<ffffffc0009bbd78>] dump_stack+0xd8/0x168
[<ffffffc000420bb0>] kasan_report_error+0x6a0/0x920
[<ffffffc000421688>] kasan_report+0x70/0xb8
[<ffffffc00041f7f0>] __asan_load8+0x60/0x78
[<ffffffc0002e05c4>] check_stack+0x344/0x848
[<ffffffc0002e0c8c>] stack_trace_call+0x1c4/0x370
[<ffffffc0002af558>] ftrace_ops_no_ops+0x2c0/0x590
[<ffffffc00009f25c>] ftrace_graph_call+0x0/0x14
[<ffffffc0000881bc>] fpsimd_thread_switch+0x24/0x1e8
[<ffffffc000089864>] __switch_to+0x34/0x218
[<ffffffc0011e089c>] __schedule+0x3ac/0x15b8
[<ffffffc0011e1f6c>] schedule+0x5c/0x178
[<ffffffc0001632a8>] smpboot_thread_fn+0x350/0x960
[<ffffffc00015b518>] kthread+0x1d8/0x2b0
[<ffffffc0000874d0>] ret_from_fork+0x10/0x40
Memory state around the buggy address:
 ffffffc0689eb980: 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 f4 f4 f4
 ffffffc0689eba00: f3 f3 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00
>ffffffc0689eba80: 00 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00
                                        ^
 ffffffc0689ebb00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffffffc0689ebb80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00

The stacker tracer traverses the whole kernel stack when saving the max stack
trace. It may touch the stack red zones to cause the warning. So, just disable
the instrumentation to silence the warning.

Link: http://lkml.kernel.org/r/1455309960-18930-1-git-send-email-yang.shi@linaro.org

Signed-off-by: Yang Shi <yang.shi@linaro.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_stack.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 202df6c..2a1abba 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -156,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack)
 		for (; p < top && i < stack_trace_max.nr_entries; p++) {
 			if (stack_dump_trace[i] == ULONG_MAX)
 				break;
-			if (*p == stack_dump_trace[i]) {
+			/*
+			 * The READ_ONCE_NOCHECK is used to let KASAN know that
+			 * this is not a stack-out-of-bounds error.
+			 */
+			if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) {
 				stack_dump_trace[x] = stack_dump_trace[i++];
 				this_size = stack_trace_index[x++] =
 					(top - p) * sizeof(unsigned long);
-- 
cgit v1.1


From 59ceeaaf355fa0fb16558ef7c24413c804932ada Mon Sep 17 00:00:00 2001
From: Simon Guinot <simon.guinot@sequanux.org>
Date: Thu, 10 Sep 2015 00:15:18 +0200
Subject: kernel/resource.c: fix muxed resource handling in __request_region()

In __request_region, if a conflict with a BUSY and MUXED resource is
detected, then the caller goes to sleep and waits for the resource to be
released.  A pointer on the conflicting resource is kept.  At wake-up
this pointer is used as a parent to retry to request the region.

A first problem is that this pointer might well be invalid (if for
example the conflicting resource have already been freed).  Another
problem is that the next call to __request_region() fails to detect a
remaining conflict.  The previously conflicting resource is passed as a
parameter and __request_region() will look for a conflict among the
children of this resource and not at the resource itself.  It is likely
to succeed anyway, even if there is still a conflict.

Instead, the parent of the conflicting resource should be passed to
__request_region().

As a fix, this patch doesn't update the parent resource pointer in the
case we have to wait for a muxed region right after.

Reported-and-tested-by: Vincent Pelletier <plr.vincent@gmail.com>
Signed-off-by: Simon Guinot <simon.guinot@sequanux.org>
Tested-by: Vincent Donnefort <vdonnefort@gmail.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/resource.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 09c0597..3669d1b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1083,9 +1083,10 @@ struct resource * __request_region(struct resource *parent,
 		if (!conflict)
 			break;
 		if (conflict != parent) {
-			parent = conflict;
-			if (!(conflict->flags & IORESOURCE_BUSY))
+			if (!(conflict->flags & IORESOURCE_BUSY)) {
+				parent = conflict;
 				continue;
+			}
 		}
 		if (conflict->flags & flags & IORESOURCE_MUXED) {
 			add_wait_queue(&muxed_resource_wait, &wait);
-- 
cgit v1.1


From 93f834df9c2d4e362dfdc4b05daa0a4e18814836 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Sat, 20 Feb 2016 14:32:24 -0800
Subject: devm_memremap: Fix error value when memremap failed

devm_memremap() returns an ERR_PTR() value in case of error.
However, it returns NULL when memremap() failed.  This causes
the caller, such as the pmem driver, to proceed and oops later.

Change devm_memremap() to return ERR_PTR(-ENXIO) when memremap()
failed.

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <stable@vger.kernel.org>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 kernel/memremap.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 2c468de..b04ea2f 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -136,8 +136,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset,
 	if (addr) {
 		*ptr = addr;
 		devres_add(dev, ptr);
-	} else
+	} else {
 		devres_free(ptr);
+		return ERR_PTR(-ENXIO);
+	}
 
 	return addr;
 }
-- 
cgit v1.1


From d045437a169f899dfb0f6f7ede24cc042543ced9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 24 Feb 2016 09:04:24 -0500
Subject: tracing: Fix showing function event in available_events

The ftrace:function event is only displayed for parsing the function tracer
data. It is not used to enable function tracing, and does not include an
"enable" file in its event directory.

Originally, this event was kept separate from other events because it did
not have a ->reg parameter. But perf added a "reg" parameter for its use
which caused issues, because it made the event available to functions where
it was not compatible for.

Commit 9b63776fa3ca9 "tracing: Do not enable function event with enable"
added a TRACE_EVENT_FL_IGNORE_ENABLE flag that prevented the function event
from being enabled by normal trace events. But this commit missed keeping
the function event from being displayed by the "available_events" directory,
which is used to show what events can be enabled by set_event.

One documented way to enable all events is to:

 cat available_events > set_event

But because the function event is displayed in the available_events, this
now causes an INVALID error:

 cat: write error: Invalid argument

Reported-by: Chunyu Hu <chuhu@redhat.com>
Fixes: 9b63776fa3ca9 "tracing: Do not enable function event with enable"
Cc: stable@vger.kernel.org # 3.4+
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f333e57..ab09829 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -869,7 +869,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		 * The ftrace subsystem is for showing formats only.
 		 * They can not be enabled or disabled via the event files.
 		 */
-		if (call->class && call->class->reg)
+		if (call->class && call->class->reg &&
+		    !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
 			return file;
 	}
 
-- 
cgit v1.1


From 84c4e620d35f49f486a900af214ad12276afb386 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:40 +0100
Subject: perf: Close install vs. exit race

Consider the following scenario:

  CPU0					CPU1

  ctx = find_get_ctx();
					perf_event_exit_task_context()
  mutex_lock(&ctx->mutex);
  perf_install_in_context(ctx, ...);
    /* NO-OP */
  mutex_unlock(&ctx->mutex);

  ...

  perf_release()
    WARN_ON_ONCE(event->state != STATE_EXIT);

Since the event doesn't pass through perf_remove_from_context()
because perf_install_in_context() NO-OPs because the ctx is dead, and
perf_event_exit_task_context() will not observe the event because its
not attached yet, the event->state will not be set.

Solve this by revalidating ctx->task after we acquire ctx->mutex and
failing the event creation as a whole.

Tested-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174947.626853419@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0d58522..d7b0316 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2158,13 +2158,15 @@ perf_install_in_context(struct perf_event_context *ctx,
 	 */
 	raw_spin_lock_irq(&ctx->lock);
 	task = ctx->task;
+
 	/*
-	 * Worse, we cannot even rely on the ctx actually existing anymore. If
-	 * between find_get_context() and perf_install_in_context() the task
-	 * went through perf_event_exit_task() its dead and we should not be
-	 * adding new events.
+	 * If between ctx = find_get_context() and mutex_lock(&ctx->mutex) the
+	 * ctx gets destroyed, we must not install an event into it.
+	 *
+	 * This is normally tested for after we acquire the mutex, so this is
+	 * a sanity check.
 	 */
-	if (task == TASK_TOMBSTONE) {
+	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
 		raw_spin_unlock_irq(&ctx->lock);
 		return;
 	}
@@ -8389,10 +8391,19 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (move_group) {
 		gctx = group_leader->ctx;
 		mutex_lock_double(&gctx->mutex, &ctx->mutex);
+		if (gctx->task == TASK_TOMBSTONE) {
+			err = -ESRCH;
+			goto err_locked;
+		}
 	} else {
 		mutex_lock(&ctx->mutex);
 	}
 
+	if (ctx->task == TASK_TOMBSTONE) {
+		err = -ESRCH;
+		goto err_locked;
+	}
+
 	if (!perf_event_validate_size(event)) {
 		err = -E2BIG;
 		goto err_locked;
@@ -8563,12 +8574,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
+	if (ctx->task == TASK_TOMBSTONE) {
+		err = -ESRCH;
+		goto err_unlock;
+	}
+
 	if (!exclusive_event_installable(event, ctx)) {
-		mutex_unlock(&ctx->mutex);
-		perf_unpin_context(ctx);
-		put_ctx(ctx);
 		err = -EBUSY;
-		goto err_free;
+		goto err_unlock;
 	}
 
 	perf_install_in_context(ctx, event, cpu);
@@ -8577,6 +8590,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	return event;
 
+err_unlock:
+	mutex_unlock(&ctx->mutex);
+	perf_unpin_context(ctx);
+	put_ctx(ctx);
 err_free:
 	free_event(event);
 err:
-- 
cgit v1.1


From 130056275ade730e7a79c110212c8815202773ee Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:41 +0100
Subject: perf: Do not double free

In case of: err_file: fput(event_file), we'll end up calling
perf_release() which in turn will free the event.

Do not then free the event _again_.

Tested-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174947.697350349@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d7b0316..211a5ce 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8520,7 +8520,12 @@ err_context:
 	perf_unpin_context(ctx);
 	put_ctx(ctx);
 err_alloc:
-	free_event(event);
+	/*
+	 * If event_file is set, the fput() above will have called ->release()
+	 * and that will take care of freeing the event.
+	 */
+	if (!event_file)
+		free_event(event);
 err_cpus:
 	put_online_cpus();
 err_task:
-- 
cgit v1.1


From a4f4bb6d0c69d0bb573f1d9e6f1b806f9b038b19 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:42 +0100
Subject: perf: Allow perf_release() with !event->ctx

In the err_file: fput(event_file) case, the event will not yet have
been attached to a context. However perf_release() does assume it has
been. Cure this.

Tested-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174947.793996260@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 211a5ce..d5299e2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3754,9 +3754,19 @@ static void put_event(struct perf_event *event)
  */
 int perf_event_release_kernel(struct perf_event *event)
 {
-	struct perf_event_context *ctx;
+	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *child, *tmp;
 
+	/*
+	 * If we got here through err_file: fput(event_file); we will not have
+	 * attached to a context yet.
+	 */
+	if (!ctx) {
+		WARN_ON_ONCE(event->attach_state &
+				(PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
+		goto no_ctx;
+	}
+
 	if (!is_kernel_event(event))
 		perf_remove_from_owner(event);
 
@@ -3832,8 +3842,8 @@ again:
 	}
 	mutex_unlock(&event->child_mutex);
 
-	/* Must be the last reference */
-	put_event(event);
+no_ctx:
+	put_event(event); /* Must be the 'last' reference */
 	return 0;
 }
 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
-- 
cgit v1.1


From 6f932e5be1503ab0783699e843db325d44c2fabb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:43 +0100
Subject: perf: Only update context time when active

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174947.860690919@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d5299e2..64698fb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2170,12 +2170,12 @@ perf_install_in_context(struct perf_event_context *ctx,
 		raw_spin_unlock_irq(&ctx->lock);
 		return;
 	}
-	update_context_time(ctx);
-	/*
-	 * Update cgrp time only if current cgrp matches event->cgrp.
-	 * Must be done before calling add_event_to_ctx().
-	 */
-	update_cgrp_time_from_event(event);
+
+	if (ctx->is_active) {
+		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
+	}
+
 	add_event_to_ctx(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
 
-- 
cgit v1.1


From a69b0ca4ac3bf5427b571f11cbf33f0a32b728d5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:44 +0100
Subject: perf: Fix cloning

Alexander reported that when the 'original' context gets destroyed, no
new clones happen.

This can happen irrespective of the ctx switch optimization, any task
can die, even the parent, and we want to continue monitoring the task
hierarchy until we either close the event or no tasks are left in the
hierarchy.

perf_event_init_context() will attempt to pin the 'parent' context
during clone(). At that point current is the parent, and since current
cannot have exited while executing clone(), its context cannot have
passed through perf_event_exit_task_context(). Therefore
perf_pin_task_context() cannot observe ctx->task == TASK_TOMBSTONE.

However, since inherit_event() does:

	if (parent_event->parent)
		parent_event = parent_event->parent;

it looks at the 'original' event when it does: is_orphaned_event().
This can return true if the context that contains the this event has
passed through perf_event_exit_task_context(). And thus we'll fail to
clone the perf context.

Fix this by adding a new state: STATE_DEAD, which is set by
perf_release() to indicate that the filedesc (or kernel reference) is
dead and there are no observers for our data left.

Only for STATE_DEAD will is_orphaned_event() be true and inhibit
cloning.

STATE_EXIT is otherwise preserved such that is_event_hup() remains
functional and will report when the observed task hierarchy becomes
empty.

Reported-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Tested-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Fixes: c6e5b73242d2 ("perf: Synchronously clean up child events")
Link: http://lkml.kernel.org/r/20160224174947.919845295@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 64698fb..92d6999 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1645,7 +1645,7 @@ out:
 
 static bool is_orphaned_event(struct perf_event *event)
 {
-	return event->state == PERF_EVENT_STATE_EXIT;
+	return event->state == PERF_EVENT_STATE_DEAD;
 }
 
 static inline int pmu_filter_match(struct perf_event *event)
@@ -1732,7 +1732,6 @@ group_sched_out(struct perf_event *group_event,
 }
 
 #define DETACH_GROUP	0x01UL
-#define DETACH_STATE	0x02UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -1752,8 +1751,6 @@ __perf_remove_from_context(struct perf_event *event,
 	if (flags & DETACH_GROUP)
 		perf_group_detach(event);
 	list_del_event(event, ctx);
-	if (flags & DETACH_STATE)
-		event->state = PERF_EVENT_STATE_EXIT;
 
 	if (!ctx->nr_events && ctx->is_active) {
 		ctx->is_active = 0;
@@ -3772,22 +3769,24 @@ int perf_event_release_kernel(struct perf_event *event)
 
 	ctx = perf_event_ctx_lock(event);
 	WARN_ON_ONCE(ctx->parent_ctx);
-	perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);
-	perf_event_ctx_unlock(event, ctx);
+	perf_remove_from_context(event, DETACH_GROUP);
 
+	raw_spin_lock_irq(&ctx->lock);
 	/*
-	 * At this point we must have event->state == PERF_EVENT_STATE_EXIT,
-	 * either from the above perf_remove_from_context() or through
-	 * perf_event_exit_event().
+	 * Mark this even as STATE_DEAD, there is no external reference to it
+	 * anymore.
 	 *
-	 * Therefore, anybody acquiring event->child_mutex after the below
-	 * loop _must_ also see this, most importantly inherit_event() which
-	 * will avoid placing more children on the list.
+	 * Anybody acquiring event->child_mutex after the below loop _must_
+	 * also see this, most importantly inherit_event() which will avoid
+	 * placing more children on the list.
 	 *
 	 * Thus this guarantees that we will in fact observe and kill _ALL_
 	 * child events.
 	 */
-	WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT);
+	event->state = PERF_EVENT_STATE_DEAD;
+	raw_spin_unlock_irq(&ctx->lock);
+
+	perf_event_ctx_unlock(event, ctx);
 
 again:
 	mutex_lock(&event->child_mutex);
@@ -4000,7 +3999,7 @@ static bool is_event_hup(struct perf_event *event)
 {
 	bool no_children;
 
-	if (event->state != PERF_EVENT_STATE_EXIT)
+	if (event->state > PERF_EVENT_STATE_EXIT)
 		return false;
 
 	mutex_lock(&event->child_mutex);
@@ -8727,7 +8726,7 @@ perf_event_exit_event(struct perf_event *child_event,
 	if (parent_event)
 		perf_group_detach(child_event);
 	list_del_event(child_event, child_ctx);
-	child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */
+	child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
 	raw_spin_unlock_irq(&child_ctx->lock);
 
 	/*
-- 
cgit v1.1


From 9107c89e269d2738019861bb518e3d59bef01781 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:45 +0100
Subject: perf: Fix race between event install and jump_labels

perf_install_in_context() relies upon the context switch hooks to have
scheduled in events when the IPI misses its target -- after all, if
the task has moved from the CPU (or wasn't running at all), it will
have to context switch to run elsewhere.

This however doesn't appear to be happening.

It is possible for the IPI to not happen (task wasn't running) only to
later observe the task running with an inactive context.

The only possible explanation is that the context switch hooks are not
called. Therefore put in a sync_sched() after toggling the jump_label
to guarantee all CPUs will have them enabled before we install an
event.

A simple if (0->1) sync_sched() will not in fact work, because any
further increment can race and complete before the sync_sched().
Therefore we must jump through some hoops.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174947.980211985@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 49 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 92d6999..ea064ca 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -321,7 +321,13 @@ enum event_type_t {
  * perf_sched_events : >0 events exist
  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
-struct static_key_deferred perf_sched_events __read_mostly;
+
+static void perf_sched_delayed(struct work_struct *work);
+DEFINE_STATIC_KEY_FALSE(perf_sched_events);
+static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
+static DEFINE_MUTEX(perf_sched_mutex);
+static atomic_t perf_sched_count;
+
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
@@ -3536,12 +3542,22 @@ static void unaccount_event(struct perf_event *event)
 	if (has_branch_stack(event))
 		dec = true;
 
-	if (dec)
-		static_key_slow_dec_deferred(&perf_sched_events);
+	if (dec) {
+		if (!atomic_add_unless(&perf_sched_count, -1, 1))
+			schedule_delayed_work(&perf_sched_work, HZ);
+	}
 
 	unaccount_event_cpu(event, event->cpu);
 }
 
+static void perf_sched_delayed(struct work_struct *work)
+{
+	mutex_lock(&perf_sched_mutex);
+	if (atomic_dec_and_test(&perf_sched_count))
+		static_branch_disable(&perf_sched_events);
+	mutex_unlock(&perf_sched_mutex);
+}
+
 /*
  * The following implement mutual exclusion of events on "exclusive" pmus
  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
@@ -7780,8 +7796,28 @@ static void account_event(struct perf_event *event)
 	if (is_cgroup_event(event))
 		inc = true;
 
-	if (inc)
-		static_key_slow_inc(&perf_sched_events.key);
+	if (inc) {
+		if (atomic_inc_not_zero(&perf_sched_count))
+			goto enabled;
+
+		mutex_lock(&perf_sched_mutex);
+		if (!atomic_read(&perf_sched_count)) {
+			static_branch_enable(&perf_sched_events);
+			/*
+			 * Guarantee that all CPUs observe they key change and
+			 * call the perf scheduling hooks before proceeding to
+			 * install events that need them.
+			 */
+			synchronize_sched();
+		}
+		/*
+		 * Now that we have waited for the sync_sched(), allow further
+		 * increments to by-pass the mutex.
+		 */
+		atomic_inc(&perf_sched_count);
+		mutex_unlock(&perf_sched_mutex);
+	}
+enabled:
 
 	account_event_cpu(event, event->cpu);
 }
@@ -9344,9 +9380,6 @@ void __init perf_event_init(void)
 	ret = init_hw_breakpoint();
 	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
 
-	/* do not patch jump label more than once per second */
-	jump_label_rate_limit(&perf_sched_events, HZ);
-
 	/*
 	 * Build time assertion that we keep the data_head at the intended
 	 * location.  IOW, validation we got the __reserved[] size right.
-- 
cgit v1.1


From 28a967c3a2f99fa3b5f762f25cb2a319d933571b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:46 +0100
Subject: perf: Cure event->pending_disable race

Because event_sched_out() checks event->pending_disable _before_
actually disabling the event, it can happen that the event fires after
it checks but before it gets disabled.

This would leave event->pending_disable set and the queued irq_work
will try and process it.

However, if the event trigger was during schedule(), the event might
have been de-scheduled by the time the irq_work runs, and
perf_event_disable_local() will fail.

Fix this by checking event->pending_disable _after_ we call
event->pmu->del(). This depends on the latter being a compiler
barrier, such that the compiler does not lift the load and re-creates
the problem.

Tested-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174948.040469884@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ea064ca..de14b67 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1696,14 +1696,14 @@ event_sched_out(struct perf_event *event,
 
 	perf_pmu_disable(event->pmu);
 
+	event->tstamp_stopped = tstamp;
+	event->pmu->del(event, 0);
+	event->oncpu = -1;
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	if (event->pending_disable) {
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-	event->tstamp_stopped = tstamp;
-	event->pmu->del(event, 0);
-	event->oncpu = -1;
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu--;
-- 
cgit v1.1


From 3cbaa59069677920186dcf502632ca1df4329f80 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:47 +0100
Subject: perf: Fix ctx time tracking by introducing EVENT_TIME

Currently any ctx_sched_in() call will re-start the ctx time tracking,
this means that calls like:

	ctx_sched_in(.event_type = EVENT_PINNED);
	ctx_sched_in(.event_type = EVENT_FLEXIBLE);

will have a hole in their ctx time tracking. This is likely harmless
but can confuse things a little. By adding EVENT_TIME, we can have the
first ctx_sched_in() (is_active: 0 -> !0) start the time and any
further ctx_sched_in() will leave the timestamps alone.

Secondly, this allows for an early disable like:

	ctx_sched_out(.event_type = EVENT_TIME);

which would update the ctx time (if the ctx is active) and any further
calls to ctx_sched_out() would not further modify the ctx time.

For ctx_sched_in() any 0 -> !0 transition will automatically include
EVENT_TIME.

For ctx_sched_out(), any transition that clears EVENT_ALL will
automatically clear EVENT_TIME.

These two rules ensure that under normal circumstances we need not
bother with EVENT_TIME and get natural ctx time behaviour.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174948.100446561@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 42 ++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index de14b67..75bde93 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -314,6 +314,7 @@ again:
 enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
+	EVENT_TIME = 0x4,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -1294,16 +1295,18 @@ static u64 perf_event_time(struct perf_event *event)
 
 /*
  * Update the total_time_enabled and total_time_running fields for a event.
- * The caller of this function needs to hold the ctx->lock.
  */
 static void update_event_times(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 	u64 run_end;
 
+	lockdep_assert_held(&ctx->lock);
+
 	if (event->state < PERF_EVENT_STATE_INACTIVE ||
 	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 		return;
+
 	/*
 	 * in cgroup mode, time_enabled represents
 	 * the time the event was enabled AND active
@@ -2349,24 +2352,33 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	}
 
 	ctx->is_active &= ~event_type;
+	if (!(ctx->is_active & EVENT_ALL))
+		ctx->is_active = 0;
+
 	if (ctx->task) {
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 		if (!ctx->is_active)
 			cpuctx->task_ctx = NULL;
 	}
 
-	update_context_time(ctx);
-	update_cgrp_time_from_cpuctx(cpuctx);
-	if (!ctx->nr_active)
+	is_active ^= ctx->is_active; /* changed bits */
+
+	if (is_active & EVENT_TIME) {
+		/* update (and stop) ctx time */
+		update_context_time(ctx);
+		update_cgrp_time_from_cpuctx(cpuctx);
+	}
+
+	if (!ctx->nr_active || !(is_active & EVENT_ALL))
 		return;
 
 	perf_pmu_disable(ctx->pmu);
-	if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
+	if (is_active & EVENT_PINNED) {
 		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 	}
 
-	if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
+	if (is_active & EVENT_FLEXIBLE) {
 		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 	}
@@ -2740,7 +2752,7 @@ ctx_sched_in(struct perf_event_context *ctx,
 	if (likely(!ctx->nr_events))
 		return;
 
-	ctx->is_active |= event_type;
+	ctx->is_active |= (event_type | EVENT_TIME);
 	if (ctx->task) {
 		if (!is_active)
 			cpuctx->task_ctx = ctx;
@@ -2748,18 +2760,24 @@ ctx_sched_in(struct perf_event_context *ctx,
 			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 	}
 
-	now = perf_clock();
-	ctx->timestamp = now;
-	perf_cgroup_set_timestamp(task, ctx);
+	is_active ^= ctx->is_active; /* changed bits */
+
+	if (is_active & EVENT_TIME) {
+		/* start ctx time */
+		now = perf_clock();
+		ctx->timestamp = now;
+		perf_cgroup_set_timestamp(task, ctx);
+	}
+
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
 	 */
-	if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
+	if (is_active & EVENT_PINNED)
 		ctx_pinned_sched_in(ctx, cpuctx);
 
 	/* Then walk through the lower prio flexible groups */
-	if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
+	if (is_active & EVENT_FLEXIBLE)
 		ctx_flexible_sched_in(ctx, cpuctx);
 }
 
-- 
cgit v1.1


From 7fce250915efca0f8f51dddee3ae89bf30d86ca5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:48 +0100
Subject: perf: Fix scaling vs. perf_event_enable_on_exec()

The recent commit 3e349507d12d ("perf: Fix perf_enable_on_exec() event
scheduling") caused this by moving task_ctx_sched_out() from before
__perf_event_mask_enable() to after it.

The overlooked consequence of that change is that task_ctx_sched_out()
would update the ctx time fields, and now __perf_event_mask_enable()
uses stale time.

In order to fix this, explicitly stop our context's time before
enabling the event(s).

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Fixes: 3e349507d12d ("perf: Fix perf_enable_on_exec() event scheduling")
Link: http://lkml.kernel.org/r/20160224174948.159242158@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 75bde93..d003088 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3143,6 +3143,7 @@ static void perf_event_enable_on_exec(int ctxn)
 
 	cpuctx = __get_cpu_context(ctx);
 	perf_ctx_lock(cpuctx, ctx);
+	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 	list_for_each_entry(event, &ctx->event_list, event_entry)
 		enabled |= event_enable_on_exec(event, ctx);
 
-- 
cgit v1.1


From bd2afa49d194c6412c333e9fdd48bc5d06bb465d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:49 +0100
Subject: perf: Fix scaling vs. perf_event_enable()

Similar to the perf_enable_on_exec(), ensure that event timings are
consistent across perf_event_enable().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174948.218288698@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 42 +++++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d003088..57c25fa 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2069,14 +2069,27 @@ static void add_event_to_ctx(struct perf_event *event,
 	event->tstamp_stopped = tstamp;
 }
 
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			       struct perf_event_context *ctx);
+static void ctx_sched_out(struct perf_event_context *ctx,
+			  struct perf_cpu_context *cpuctx,
+			  enum event_type_t event_type);
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
 	     enum event_type_t event_type,
 	     struct task_struct *task);
 
+static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
+			       struct perf_event_context *ctx)
+{
+	if (!cpuctx->task_ctx)
+		return;
+
+	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+		return;
+
+	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+}
+
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
 				struct perf_event_context *ctx,
 				struct task_struct *task)
@@ -2227,17 +2240,18 @@ static void __perf_event_enable(struct perf_event *event,
 	    event->state <= PERF_EVENT_STATE_ERROR)
 		return;
 
-	update_context_time(ctx);
+	if (ctx->is_active)
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+
 	__perf_event_mark_enabled(event);
 
 	if (!ctx->is_active)
 		return;
 
 	if (!event_filter_match(event)) {
-		if (is_cgroup_event(event)) {
-			perf_cgroup_set_timestamp(current, ctx); // XXX ?
+		if (is_cgroup_event(event))
 			perf_cgroup_defer_enabled(event);
-		}
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 		return;
 	}
 
@@ -2245,8 +2259,10 @@ static void __perf_event_enable(struct perf_event *event,
 	 * If the event is in a group and isn't the group leader,
 	 * then don't put it on unless the group is on.
 	 */
-	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
+	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 		return;
+	}
 
 	task_ctx = cpuctx->task_ctx;
 	if (ctx->task)
@@ -2658,18 +2674,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
 		perf_cgroup_sched_out(task, next);
 }
 
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			       struct perf_event_context *ctx)
-{
-	if (!cpuctx->task_ctx)
-		return;
-
-	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
-		return;
-
-	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
-}
-
 /*
  * Called with IRQs disabled
  */
-- 
cgit v1.1


From a096309bc4677f60caa8e93fcc613a55073c51d4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:50 +0100
Subject: perf: Fix scaling vs. perf_install_in_context()

Completely reworks perf_install_in_context() (again!) in order to
ensure that there will be no ctx time hole between add_event_to_ctx()
and any potential ctx_sched_in().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174948.279399438@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 115 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 70 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 57c25fa..25edabd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -276,10 +276,10 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
 		return;
 	}
 
-again:
 	if (task == TASK_TOMBSTONE)
 		return;
 
+again:
 	if (!task_function_call(task, event_function, &efs))
 		return;
 
@@ -289,13 +289,15 @@ again:
 	 * a concurrent perf_event_context_sched_out().
 	 */
 	task = ctx->task;
-	if (task != TASK_TOMBSTONE) {
-		if (ctx->is_active) {
-			raw_spin_unlock_irq(&ctx->lock);
-			goto again;
-		}
-		func(event, NULL, ctx, data);
+	if (task == TASK_TOMBSTONE) {
+		raw_spin_unlock_irq(&ctx->lock);
+		return;
+	}
+	if (ctx->is_active) {
+		raw_spin_unlock_irq(&ctx->lock);
+		goto again;
 	}
+	func(event, NULL, ctx, data);
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
@@ -2116,49 +2118,68 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 /*
  * Cross CPU call to install and enable a performance event
  *
- * Must be called with ctx->mutex held
+ * Very similar to remote_function() + event_function() but cannot assume that
+ * things like ctx->is_active and cpuctx->task_ctx are set.
  */
 static int  __perf_install_in_context(void *info)
 {
-	struct perf_event_context *ctx = info;
+	struct perf_event *event = info;
+	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
+	bool activate = true;
+	int ret = 0;
 
 	raw_spin_lock(&cpuctx->ctx.lock);
 	if (ctx->task) {
 		raw_spin_lock(&ctx->lock);
-		/*
-		 * If we hit the 'wrong' task, we've since scheduled and
-		 * everything should be sorted, nothing to do!
-		 */
 		task_ctx = ctx;
-		if (ctx->task != current)
+
+		/* If we're on the wrong CPU, try again */
+		if (task_cpu(ctx->task) != smp_processor_id()) {
+			ret = -ESRCH;
 			goto unlock;
+		}
 
 		/*
-		 * If task_ctx is set, it had better be to us.
+		 * If we're on the right CPU, see if the task we target is
+		 * current, if not we don't have to activate the ctx, a future
+		 * context switch will do that for us.
 		 */
-		WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx);
+		if (ctx->task != current)
+			activate = false;
+		else
+			WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+
 	} else if (task_ctx) {
 		raw_spin_lock(&task_ctx->lock);
 	}
 
-	ctx_resched(cpuctx, task_ctx);
+	if (activate) {
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+		add_event_to_ctx(event, ctx);
+		ctx_resched(cpuctx, task_ctx);
+	} else {
+		add_event_to_ctx(event, ctx);
+	}
+
 unlock:
 	perf_ctx_unlock(cpuctx, task_ctx);
 
-	return 0;
+	return ret;
 }
 
 /*
- * Attach a performance event to a context
+ * Attach a performance event to a context.
+ *
+ * Very similar to event_function_call, see comment there.
  */
 static void
 perf_install_in_context(struct perf_event_context *ctx,
 			struct perf_event *event,
 			int cpu)
 {
-	struct task_struct *task = NULL;
+	struct task_struct *task = READ_ONCE(ctx->task);
 
 	lockdep_assert_held(&ctx->mutex);
 
@@ -2166,42 +2187,46 @@ perf_install_in_context(struct perf_event_context *ctx,
 	if (event->cpu != -1)
 		event->cpu = cpu;
 
+	if (!task) {
+		cpu_function_call(cpu, __perf_install_in_context, event);
+		return;
+	}
+
+	/*
+	 * Should not happen, we validate the ctx is still alive before calling.
+	 */
+	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
+		return;
+
 	/*
 	 * Installing events is tricky because we cannot rely on ctx->is_active
 	 * to be set in case this is the nr_events 0 -> 1 transition.
-	 *
-	 * So what we do is we add the event to the list here, which will allow
-	 * a future context switch to DTRT and then send a racy IPI. If the IPI
-	 * fails to hit the right task, this means a context switch must have
-	 * happened and that will have taken care of business.
 	 */
-	raw_spin_lock_irq(&ctx->lock);
-	task = ctx->task;
-
+again:
 	/*
-	 * If between ctx = find_get_context() and mutex_lock(&ctx->mutex) the
-	 * ctx gets destroyed, we must not install an event into it.
-	 *
-	 * This is normally tested for after we acquire the mutex, so this is
-	 * a sanity check.
+	 * Cannot use task_function_call() because we need to run on the task's
+	 * CPU regardless of whether its current or not.
 	 */
+	if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+		return;
+
+	raw_spin_lock_irq(&ctx->lock);
+	task = ctx->task;
 	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
+		/*
+		 * Cannot happen because we already checked above (which also
+		 * cannot happen), and we hold ctx->mutex, which serializes us
+		 * against perf_event_exit_task_context().
+		 */
 		raw_spin_unlock_irq(&ctx->lock);
 		return;
 	}
-
-	if (ctx->is_active) {
-		update_context_time(ctx);
-		update_cgrp_time_from_event(event);
-	}
-
-	add_event_to_ctx(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
-
-	if (task)
-		task_function_call(task, __perf_install_in_context, ctx);
-	else
-		cpu_function_call(cpu, __perf_install_in_context, ctx);
+	/*
+	 * Since !ctx->is_active doesn't mean anything, we must IPI
+	 * unconditionally.
+	 */
+	goto again;
 }
 
 /*
-- 
cgit v1.1


From 0da4cf3e0a68c97ef811569804616a811f786729 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:51 +0100
Subject: perf: Robustify task_function_call()

Since there is no serialization between task_function_call() doing
task_curr() and the other CPU doing context switches, we could end
up not sending an IPI even if we had to.

And I'm not sure I still buy my own argument we're OK.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174948.340031200@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 25edabd..6146148 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -64,8 +64,17 @@ static void remote_function(void *data)
 	struct task_struct *p = tfc->p;
 
 	if (p) {
-		tfc->ret = -EAGAIN;
-		if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+		/* -EAGAIN */
+		if (task_cpu(p) != smp_processor_id())
+			return;
+
+		/*
+		 * Now that we're on right CPU with IRQs disabled, we can test
+		 * if we hit the right task without races.
+		 */
+
+		tfc->ret = -ESRCH; /* No such (running) process */
+		if (p != current)
 			return;
 	}
 
@@ -92,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
 		.p	= p,
 		.func	= func,
 		.info	= info,
-		.ret	= -ESRCH, /* No such (running) process */
+		.ret	= -EAGAIN,
 	};
+	int ret;
 
-	if (task_curr(p))
-		smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+	do {
+		ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+		if (!ret)
+			ret = data.ret;
+	} while (ret == -EAGAIN);
 
-	return data.ret;
+	return ret;
 }
 
 /**
@@ -169,19 +182,6 @@ static bool is_kernel_event(struct perf_event *event)
  *    rely on ctx->is_active and therefore cannot use event_function_call().
  *    See perf_install_in_context().
  *
- * This is because we need a ctx->lock serialized variable (ctx->is_active)
- * to reliably determine if a particular task/context is scheduled in. The
- * task_curr() use in task_function_call() is racy in that a remote context
- * switch is not a single atomic operation.
- *
- * As is, the situation is 'safe' because we set rq->curr before we do the
- * actual context switch. This means that task_curr() will fail early, but
- * we'll continue spinning on ctx->is_active until we've passed
- * perf_event_task_sched_out().
- *
- * Without this ctx->lock serialized variable we could have race where we find
- * the task (and hence the context) would not be active while in fact they are.
- *
  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
  */
 
@@ -212,7 +212,7 @@ static int event_function(void *info)
 	 */
 	if (ctx->task) {
 		if (ctx->task != current) {
-			ret = -EAGAIN;
+			ret = -ESRCH;
 			goto unlock;
 		}
 
-- 
cgit v1.1


From 6fe1f348b3dd1f700f9630562b7d38afd6949568 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 21 Jan 2016 22:24:16 +0100
Subject: sched/cgroup: Fix cgroup entity load tracking tear-down

When a cgroup's CPU runqueue is destroyed, it should remove its
remaining load accounting from its parent cgroup.

The current site for doing so it unsuited because its far too late and
unordered against other cgroup removal (->css_free() will be, but we're also
in an RCU callback).

Put it in the ->css_offline() callback, which is the start of cgroup
destruction, right after the group has been made unavailable to
userspace. The ->css_offline() callbacks are called in hierarchical order
after the following v4.4 commit:

  aa226ff4a1ce ("cgroup: make sure a parent css isn't offlined before its children")

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160121212416.GL6357@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  |  4 +---
 kernel/sched/fair.c  | 37 +++++++++++++++++++++----------------
 kernel/sched/sched.h |  2 +-
 3 files changed, 23 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9503d59..ab814bf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7860,11 +7860,9 @@ void sched_destroy_group(struct task_group *tg)
 void sched_offline_group(struct task_group *tg)
 {
 	unsigned long flags;
-	int i;
 
 	/* end participation in shares distribution */
-	for_each_possible_cpu(i)
-		unregister_fair_sched_group(tg, i);
+	unregister_fair_sched_group(tg);
 
 	spin_lock_irqsave(&task_group_lock, flags);
 	list_del_rcu(&tg->list);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56b7d4b..cce3303 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8234,11 +8234,8 @@ void free_fair_sched_group(struct task_group *tg)
 	for_each_possible_cpu(i) {
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
-		if (tg->se) {
-			if (tg->se[i])
-				remove_entity_load_avg(tg->se[i]);
+		if (tg->se)
 			kfree(tg->se[i]);
-		}
 	}
 
 	kfree(tg->cfs_rq);
@@ -8286,21 +8283,29 @@ err:
 	return 0;
 }
 
-void unregister_fair_sched_group(struct task_group *tg, int cpu)
+void unregister_fair_sched_group(struct task_group *tg)
 {
-	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
+	struct rq *rq;
+	int cpu;
 
-	/*
-	* Only empty task groups can be destroyed; so we can speculatively
-	* check on_list without danger of it being re-added.
-	*/
-	if (!tg->cfs_rq[cpu]->on_list)
-		return;
+	for_each_possible_cpu(cpu) {
+		if (tg->se[cpu])
+			remove_entity_load_avg(tg->se[cpu]);
 
-	raw_spin_lock_irqsave(&rq->lock, flags);
-	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+		/*
+		 * Only empty task groups can be destroyed; so we can speculatively
+		 * check on_list without danger of it being re-added.
+		 */
+		if (!tg->cfs_rq[cpu]->on_list)
+			continue;
+
+		rq = cpu_rq(cpu);
+
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
 }
 
 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@@ -8382,7 +8387,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	return 1;
 }
 
-void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+void unregister_fair_sched_group(struct task_group *tg) { }
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10f1637..30ea2d8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -313,7 +313,7 @@ extern int tg_nop(struct task_group *tg, void *data);
 
 extern void free_fair_sched_group(struct task_group *tg);
 extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
-extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
+extern void unregister_fair_sched_group(struct task_group *tg);
 extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 			struct sched_entity *se, int cpu,
 			struct sched_entity *parent);
-- 
cgit v1.1


From 48be3a67da7413d62e5efbcf2c73a9dddf61fb96 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 23 Feb 2016 13:28:22 +0100
Subject: sched/deadline: Always calculate end of period on sched_yield()

Steven noticed that occasionally a sched_yield() call would not result
in a wait for the next period edge as expected.

It turns out that when we call update_curr_dl() and end up with
delta_exec <= 0, we will bail early and fail to throttle.

Further inspection of the yield code revealed that yield_task_dl()
clearing dl.runtime is wrong too, it will not account the last bit of
runtime which could result in dl.runtime < 0, which in turn means that
replenish would gift us with too much runtime.

Fix both issues by not relying on the dl.runtime value for yield.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Clark Williams <williams@redhat.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Juri Lelli <juri.lelli@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160223122822.GP6357@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 57b939c..04a569c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -399,6 +399,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
 		dl_se->runtime = pi_se->dl_runtime;
 	}
 
+	if (dl_se->dl_yielded && dl_se->runtime > 0)
+		dl_se->runtime = 0;
+
 	/*
 	 * We keep moving the deadline away until we get some
 	 * available runtime for the entity. This ensures correct
@@ -735,8 +738,11 @@ static void update_curr_dl(struct rq *rq)
 	 * approach need further study.
 	 */
 	delta_exec = rq_clock_task(rq) - curr->se.exec_start;
-	if (unlikely((s64)delta_exec <= 0))
+	if (unlikely((s64)delta_exec <= 0)) {
+		if (unlikely(dl_se->dl_yielded))
+			goto throttle;
 		return;
+	}
 
 	schedstat_set(curr->se.statistics.exec_max,
 		      max(curr->se.statistics.exec_max, delta_exec));
@@ -749,8 +755,10 @@ static void update_curr_dl(struct rq *rq)
 
 	sched_rt_avg_update(rq, delta_exec);
 
-	dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
-	if (dl_runtime_exceeded(dl_se)) {
+	dl_se->runtime -= delta_exec;
+
+throttle:
+	if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
 		dl_se->dl_throttled = 1;
 		__dequeue_task_dl(rq, curr, 0);
 		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
@@ -994,18 +1002,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
  */
 static void yield_task_dl(struct rq *rq)
 {
-	struct task_struct *p = rq->curr;
-
 	/*
 	 * We make the task go to sleep until its current deadline by
 	 * forcing its runtime to zero. This way, update_curr_dl() stops
 	 * it and the bandwidth timer will wake it up and will give it
 	 * new scheduling parameters (thanks to dl_yielded=1).
 	 */
-	if (p->dl.runtime > 0) {
-		rq->curr->dl.dl_yielded = 1;
-		p->dl.runtime = 0;
-	}
+	rq->curr->dl.dl_yielded = 1;
+
 	update_rq_clock(rq);
 	update_curr_dl(rq);
 	/*
-- 
cgit v1.1